From 24cbbb393875b4d5d9415235bfdb670de7469b20 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 4 Mar 2025 16:18:34 +0000 Subject: [PATCH 01/12] [AArch64] Make use of byte FPR stores for bytes extracted from vectors This helps avoid some pointless `fmovs` in some cases. Currently, this is done in ISEL as FPR bytes are problematic in SDAG (as neither GPR nor FPR bytes are a legal type). --- llvm/include/llvm/CodeGen/ValueTypes.td | 2 + llvm/lib/CodeGen/ValueTypes.cpp | 2 + .../Target/AArch64/AArch64ISelLowering.cpp | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 +- .../lib/Target/AArch64/AArch64RegisterInfo.td | 2 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 37 ++++ .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 159 ++++++++++++++---- llvm/test/CodeGen/AArch64/add.ll | 3 +- llvm/test/CodeGen/AArch64/andorxor.ll | 9 +- .../test/CodeGen/AArch64/arm64-collect-loh.ll | 9 +- llvm/test/CodeGen/AArch64/arm64-st1.ll | 51 ++++-- llvm/test/CodeGen/AArch64/bitcast-v2i8.ll | 3 +- llvm/test/CodeGen/AArch64/ctlz.ll | 3 +- llvm/test/CodeGen/AArch64/ctpop.ll | 3 +- llvm/test/CodeGen/AArch64/cttz.ll | 3 +- .../CodeGen/AArch64/extract-vector-cmp.ll | 7 +- llvm/test/CodeGen/AArch64/mul.ll | 3 +- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 6 +- llvm/test/CodeGen/AArch64/nontemporal-load.ll | 3 +- llvm/test/CodeGen/AArch64/pr-cf624b2.ll | 6 +- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 5 +- .../CodeGen/AArch64/setcc-type-mismatch.ll | 3 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 5 +- llvm/test/CodeGen/AArch64/store.ll | 3 +- llvm/test/CodeGen/AArch64/sub.ll | 3 +- ...-streaming-mode-fixed-length-ld2-alloca.ll | 9 +- ...mode-fixed-length-masked-gather-scatter.ll | 12 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 5 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 5 +- .../vec-combine-compare-truncate-store.ll | 11 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 26 ++- llvm/test/CodeGen/AArch64/vector-compress.ll | 2 +- 32 files changed, 273 insertions(+), 139 
deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index fc1a95e33380b..42c4830e94220 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>; // FIXME: Remove this and the getPointerType() override if MVT::i82 is added. def amdgpuBufferStridedPointer : ValueType<192, 235>; +def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64) + let isNormalValueType = false in { def token : ValueType<0, 504>; // TokenTy def MetadataVT : ValueType<0, 505> { // Metadata diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 0554b6387c5e6..c769568253b12 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -198,6 +198,8 @@ std::string EVT::getEVTString() const { return "amdgpuBufferFatPointer"; case MVT::amdgpuBufferStridedPointer: return "amdgpuBufferStridedPointer"; + case MVT::vi8: + return "vi8"; } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e366d7cb54490..4505022b884ca 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -401,6 +401,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass); addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index a2d98a0862988..2be7d90579654 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3587,7 +3587,7 @@ defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr", (load (am_indexed32 GPR64sp:$Rn, 
uimm12s4:$offset)))]>; let Predicates = [HasFPARMv8] in { defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr", - [(set FPR8Op:$Rt, + [(set (i8 FPR8Op:$Rt), (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr", [(set (f16 FPR16Op:$Rt), @@ -3775,7 +3775,7 @@ defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur", (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; let Predicates = [HasFPARMv8] in { defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur", - [(set FPR8Op:$Rt, + [(set (i8 FPR8Op:$Rt), (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur", [(set (f16 FPR16Op:$Rt), @@ -4345,7 +4345,7 @@ defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str", (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; let Predicates = [HasFPARMv8] in { defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str", - [(store FPR8Op:$Rt, + [(store (i8 FPR8Op:$Rt), (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str", [(store (f16 FPR16Op:$Rt), @@ -4463,6 +4463,8 @@ multiclass VecStoreLane0Pat; + defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; @@ -4481,7 +4483,7 @@ defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur", (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; let Predicates = [HasFPARMv8] in { defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur", - [(store FPR8Op:$Rt, + [(store (i8 FPR8Op:$Rt), (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur", [(store (f16 FPR16Op:$Rt), @@ -4610,6 +4612,7 @@ multiclass VecStoreULane0Pat; defm : VecStoreULane0Pat; defm : VecStoreULane0Pat; defm : VecStoreULane0Pat; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index fed9b7b173e9c..42ba1451650ed 100644 --- 
a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -497,7 +497,7 @@ def Q30 : AArch64Reg<30, "q30", [D30, D30_HI], ["v30", ""]>, DwarfRegAlias, DwarfRegAlias; } -def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> { +def FPR8 : RegisterClass<"AArch64", [i8, vi8], 8, (sequence "B%u", 0, 31)> { let Size = 8; let DecoderMethod = "DecodeSimpleRegisterClass"; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a2f326c994c2f..a8aacf95dc1ff 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1825,6 +1825,43 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat; } // End HasSVE +multiclass SVEVecStoreLanePat { + let Predicates = [HasSVE_or_SME] in { + // Same as Neon VecStoreLane0Pat but without matching VecListOne128. + def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; + } + + // Non-zero immediate index: + def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; +} + +// Note: Types other than i8 are handled in performSTORECombine -- i8 is tricky +// to handle before ISEL as it is not really a legal type in many places, nor +// is its equivalently sized FP variant. 
+let AddedComplexity = 19 in { + // Lane 0 truncating stores + // i32 -> i8 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 -> i8 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i8 -> i8 (technically a truncate as the extracted type is i32) + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; +} + let Predicates = [HasSVE_or_SME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index d39c9bf760621..b91cb872a9e0a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING -; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -106,18 +106,11 @@ entry: } define void @test_str_lane_s8(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] -; CHECK-NONSTREAMING-NEXT: strb w8, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] -; STREAMING-COMPAT-NEXT: 
fmov w8, s0 -; STREAMING-COMPAT-NEXT: strb w8, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 7 @@ -128,8 +121,7 @@ entry: define void @test_str_lane0_s8(ptr %a, %b) { ; CHECK-LABEL: test_str_lane0_s8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret entry: @@ -201,6 +193,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, %p0, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i8 + store i8 %trunc, ptr %ptr, align 1 + ret void +} + define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, %p0, %v) { ; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset: ; CHECK: // %bb.0: @@ -242,6 +247,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i8 + %out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8 + store i8 %trunc, ptr %out_ptr, align 1 + ret void +} + define void @test_str_lane_s32_negative_offset(ptr %a, %b) { ; CHECK-LABEL: test_str_lane_s32_negative_offset: ; CHECK: // %bb.0: // %entry @@ -297,18 +316,11 @@ entry: } define void @test_str_lane_s8_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] -; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8] -; 
CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] -; STREAMING-COMPAT-NEXT: fmov w8, s0 -; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 7 @@ -320,8 +332,7 @@ entry: define void @test_str_lane0_s8_negative_offset(ptr %a, %b) { ; CHECK-LABEL: test_str_lane0_s8_negative_offset: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sturb w8, [x0, #-8] +; CHECK-NEXT: stur b0, [x0, #-8] ; CHECK-NEXT: ret entry: @@ -385,6 +396,48 @@ entry: ret void } + +define void @test_str_trunc_lane_s32_to_s8(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s32_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i32 %0 to i8 + store i8 %trunc, ptr %a, align 1 + ret void +} + +define void @test_str_trunc_lane0_s32_to_s8(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %trunc = trunc i32 %0 to i8 + store i8 %trunc, ptr %a, align 1 + ret void +} + +define void @test_str_trunc_lane_s64_to_s8(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s64_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[3] +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i64 %0 to i8 + store i8 %trunc, ptr %a, align 1 + ret void +} + define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { ; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: ; CHECK: // %bb.0: // %entry @@ -413,3 +466,47 @@ 
entry: store i16 %trunc, ptr %out_ptr, align 2 ret void } + +define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i32 %0 to i8 + %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 + store i8 %trunc, ptr %out_ptr, align 1 + ret void +} + +define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %trunc = trunc i32 %0 to i8 + %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 + store i8 %trunc, ptr %out_ptr, align 1 + ret void +} + +define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[3] +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i64 %0 to i8 + %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 + store i8 %trunc, ptr %out_ptr, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index fc0ba336b21cc..cdde359d09d7b 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -64,8 +64,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 0384848082caa..e11c62ba70de4 100644 --- 
a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -184,8 +184,7 @@ define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; @@ -221,8 +220,7 @@ define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; @@ -258,8 +256,7 @@ define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll index 2c065e0051cd7..7f2bebf584d8f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -615,11 +615,10 @@ define <1 x i8> @getL() { ; CHECK-NEXT: ; kill ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF] -; Ultimately we should generate str b0, but right now, we match the vector -; variant which does not allow to fold the immediate into the store. 
-; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]] +; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: +; CHECK-NEXT: str b0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret -; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] +; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setL(<1 x i8> %t) { store <1 x i8> %t, ptr @L, align 4 ret void @@ -678,6 +677,6 @@ if.end.i: call void (ptr, ...) @callee(ptr @.str.89, ptr @.str.90, double %sub) unreachable } -declare void @callee(ptr nocapture readonly, ...) +declare void @callee(ptr nocapture readonly, ...) attributes #0 = { "target-cpu"="cyclone" } diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index 02797f3ed186c..df37231a086f0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -16,9 +16,15 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) { } define void @st1lane0_16b(<16 x i8> %A, ptr %D) { -; CHECK-LABEL: st1lane0_16b: -; CHECK: add x8, x0, #1 -; CHECK: st1.b { v0 }[0], [x8] +; SD-CHECK-LABEL: st1lane0_16b: +; SD-CHECK: str b0, [x0, #1] +; +; GI-CHECK-LABEL: st1lane0_16b: +; GI-CHECK: add x8, x0, #1 +; GI-CHECK: st1.b { v0 }[0], [x8] +; +; EXYNOS-LABEL: st1lane0_16b: +; EXYNOS: str b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 0 @@ -27,10 +33,15 @@ define void @st1lane0_16b(<16 x i8> %A, ptr %D) { } define void @st1lane0u_16b(<16 x i8> %A, ptr %D) { -; CHECK-LABEL: st1lane0u_16b: -; CHECK: sub x8, x0, #1 -; CHECK: st1.b { v0 }[0], [x8] - +; SD-CHECK-LABEL: st1lane0u_16b: +; SD-CHECK: stur b0, [x0, #-1] +; +; GI-CHECK-LABEL: st1lane0u_16b: +; GI-CHECK: sub x8, x0, #1 +; GI-CHECK: st1.b { v0 }[0], [x8] +; +; EXYNOS-LABEL: st1lane0u_16b: +; EXYNOS: stur b0, [x0, #-1] %ptr = getelementptr i8, ptr %D, i64 -1 %tmp = extractelement <16 x i8> %A, i32 0 store i8 %tmp, ptr %ptr @@ -49,10 +60,15 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { } 
define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane0_ro_16b: -; CHECK: add x8, x0, x1 -; CHECK: st1.b { v0 }[0], [x8] - +; SD-CHECK-LABEL: st1lane0_ro_16b: +; SD-CHECK: str b0, [x0, x1] +; +; GI-CHECK-LABEL: st1lane0_ro_16b: +; GI-CHECK: add x8, x0, x1 +; GI-CHECK: st1.b { v0 }[0], [x8] +; +; EXYNOS-LABEL: st1lane0_ro_16b: +; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <16 x i8> %A, i32 0 store i8 %tmp, ptr %ptr @@ -317,10 +333,15 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { } define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane0_ro_8b: -; CHECK: add x8, x0, x1 -; CHECK: st1.b { v0 }[0], [x8] - +; SD-CHECK-LABEL: st1lane0_ro_8b: +; SD-CHECK: str b0, [x0, x1] +; +; GI-CHECK-LABEL: st1lane0_ro_8b: +; GI-CHECK: add x8, x0, x1 +; GI-CHECK: st1.b { v0 }[0], [x8] +; +; EXYNOS-LABEL: st1lane0_ro_8b: +; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <8 x i8> %A, i32 0 store i8 %tmp, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll index aff3ffc70a711..77304aef4385e 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll @@ -5,9 +5,8 @@ define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) { ; CHECK-LABEL: test_bitcast_v2i8_to_i16 ; CHECK: mov.s [[WREG_HI:w[0-9]+]], v0[1] -; CHECK-NEXT: fmov [[WREG_LO:w[0-9]+]], s0 ; CHECK-NEXT: strb [[WREG_HI]], [sp, #15] -; CHECK-NEXT: strb [[WREG_LO]], [sp, #14] +; CHECK-NEXT: str [[WREG_LO:b[0-9]+]], [sp, #14] ; CHECK-NEXT: ldrh w0, [sp, #14] %aa = bitcast <2 x i8> %a to i16 diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 742433c50d390..79676efebe776 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -14,8 +14,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: clz v1.2s, 
v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index c7c378d3e67cd..767b9d28d6215 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -14,8 +14,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index 41843e03cb81e..97f5a29064c67 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -17,8 +17,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: clz v0.2s, v0.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll index 8345fdfa46b4c..f076ee12427d8 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll @@ -184,17 +184,16 @@ define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) { ; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #235 -; CHECK-NEXT: adrp x9, .LCPI8_0 +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0] ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v1.4h, v0.4s ; CHECK-NEXT: and v0.16b, v0.16b, 
v2.16b ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: str b0, [x8] ; CHECK-NEXT: and w0, w9, #0x1 -; CHECK-NEXT: strb w10, [x8] ; CHECK-NEXT: ret %icmp = icmp ult <4 x i32> %a, splat(i32 235) %ext = extractelement <4 x i1> %icmp, i32 1 diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 500379d1cfdec..0d7a6a7dbcb11 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -76,8 +76,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index 3d3362d314a99..a070e3d7565ed 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -90,8 +90,7 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w9, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i32> %a to <2 x i8> @@ -157,8 +156,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) { ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w9, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i16> %a to <2 x i8> diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll index 959ac7f68e351..28cff55beff9e 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ 
-449,10 +449,9 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) { ; CHECK-LABEL: test_ldnp_v33i8: ; CHECK: ; %bb.0: ; CHECK-NEXT: ldnp q0, q1, [x0] -; CHECK-NEXT: add x9, x8, #32 ; CHECK-NEXT: ldr b2, [x0, #32] ; CHECK-NEXT: stp q0, q1, [x8] -; CHECK-NEXT: st1.b { v2 }[0], [x9] +; CHECK-NEXT: str b2, [x8, #32] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v33i8: diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll index ea9588e9e3db7..0b0540e559abd 100644 --- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll +++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll @@ -14,9 +14,9 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) { ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: str b0, [sp] ; CHECK-NEXT: orr x9, x8, #0xf ; CHECK-NEXT: orr x10, x8, #0xe -; CHECK-NEXT: st1 { v0.b }[0], [x8] ; CHECK-NEXT: st1 { v0.b }[15], [x9] ; CHECK-NEXT: orr x9, x8, #0xc ; CHECK-NEXT: st1 { v0.b }[12], [x9] @@ -46,9 +46,9 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) { ; CHECK-NEXT: mov w10, #9 // =0x9 ; CHECK-NEXT: st1 { v0.b }[10], [x9] ; CHECK-NEXT: orr x9, x8, x10 +; CHECK-NEXT: mov w10, #5 // =0x5 +; CHECK-NEXT: orr x8, x8, x10 ; CHECK-NEXT: st1 { v0.b }[9], [x9] -; CHECK-NEXT: mov w9, #5 // =0x5 -; CHECK-NEXT: orr x8, x8, x9 ; CHECK-NEXT: st1 { v0.b }[5], [x8] ; CHECK-NEXT: ldr q0, [sp] ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 4d76994be204f..cbb3b06030bae 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -201,8 +201,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x2] +; 
CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: strb w8, [x2, #1] ; CHECK-SD-NEXT: ret ; @@ -325,7 +324,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: ldr b0, [x0] ; CHECK-SD-NEXT: ldr b1, [x1] ; CHECK-SD-NEXT: sqadd v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: st1 { v0.b }[0], [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v1i8: diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll index c0a728014e390..950ac92a8b12f 100644 --- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll +++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll @@ -12,8 +12,7 @@ define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) { ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret %tst = icmp eq <4 x i22> %l, %r diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index ae2a16929e254..04b379f455008 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -202,8 +202,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: strb w8, [x2, #1] ; CHECK-SD-NEXT: ret ; @@ -326,7 +325,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: ldr b0, [x0] ; CHECK-SD-NEXT: ldr b1, [x1] ; CHECK-SD-NEXT: sqsub v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: st1 { v0.b }[0], [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v1i8: diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 37a6ad08d4cb3..7ea957d9d165d 100644 --- 
a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -111,8 +111,7 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){ ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 8183a82f21cb5..91a17a89af6e1 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -64,8 +64,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: strb w8, [x0, #1] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index aa1adfd306a4c..89a06bc9d5b4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -73,8 +73,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: zip1 z1.s, z1.s, z0.s ; CHECK-NEXT: st1b { z1.h }, p0, [x8] ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x19, #2] +; CHECK-NEXT: str b0, [x19, #2] ; CHECK-NEXT: str h1, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 @@ -119,11 +118,11 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: bl def ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldp q0, q2, [sp] +; CHECK-NEXT: ldr q0, [sp] ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: fmov w8, s2 -; 
CHECK-NEXT: strb w8, [x19, #8] +; CHECK-NEXT: ldr q1, [sp, #16] +; CHECK-NEXT: str b1, [x19, #8] ; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll index d9f8482a3c503..b1ac9469c0573 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll @@ -20,9 +20,8 @@ define <2 x i64> @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: str b0, [sp, #12] +; CHECK-NEXT: ldrb w8, [sp, #12] ; CHECK-NEXT: tbz w8, #0, .LBB0_2 ; CHECK-NEXT: // %bb.1: // %cond.load ; CHECK-NEXT: fmov x9, d1 @@ -109,11 +108,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z1.d, z2.d, z1.d -; CHECK-NEXT: uaddv d1, p0, z1.d -; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: uaddv d2, p0, z1.d ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: strb w8, [sp, #12] -; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: str b2, [sp, #12] +; CHECK-NEXT: ldrb w8, [sp, #12] ; CHECK-NEXT: tbnz w8, #0, .LBB1_3 ; CHECK-NEXT: // %bb.1: // %else ; CHECK-NEXT: tbnz w8, #1, .LBB1_4 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index d0173307bd830..edd96ae4836a4 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -199,8 +199,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; 
CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: strb w8, [x2, #1] ; CHECK-SD-NEXT: ret ; @@ -324,7 +323,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: ldr b0, [x0] ; CHECK-SD-NEXT: ldr b1, [x1] ; CHECK-SD-NEXT: uqadd v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: st1 { v0.b }[0], [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v1i8: diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index dc3ebfb0682ca..63ca1b51c2291 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -198,8 +198,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strb w9, [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: strb w8, [x2, #1] ; CHECK-SD-NEXT: ret ; @@ -321,7 +320,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: ldr b0, [x0] ; CHECK-SD-NEXT: ldr b1, [x1] ; CHECK-SD-NEXT: uqsub v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: st1 { v0.b }[0], [x2] +; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v1i8: diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll index dd7a9c6d7768b..d9b5a42ba98a6 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll @@ -56,8 +56,7 @@ define void @store_4_elements(<4 x i32> %vec, ptr %out) { ; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] ; CHECK-NEXT: bic.16b v0, v1, v0 ; CHECK-NEXT: addv.4s s0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: 
ret ; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 @@ -99,8 +98,7 @@ define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) { ; CHECK-NEXT: cmlt.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: addv.4s s0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9 @@ -141,7 +139,7 @@ define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) { ; CHECK-NEXT: ldr d1, [x8, lCPI6_0@PAGEOFF] ; CHECK-NEXT: bic.8b v0, v1, v0 ; CHECK-NEXT: addv.8b b0, v0 -; CHECK-NEXT: st1.b { v0 }[0], [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13 @@ -182,8 +180,7 @@ define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) { ; CHECK-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] ; CHECK-NEXT: bic.8b v0, v1, v0 ; CHECK-NEXT: addp.2s v0, v0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh17 diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 45b7a2759b0b3..4aa7fa8b22b4f 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -403,7 +403,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.b { v0 }[2], [x8] ; CHECK-NEXT: st1.b { v0 }[4], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x1] +; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 ; @@ -592,7 +592,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; CHECK-NEXT: ushr.4s v0, v0, #16 ; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x1] +; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store: @@ -626,7 +626,7 @@ define void 
@shift_trunc_store_default_align(ptr %src, ptr %dst) { ; CHECK-NEXT: ushr.4s v0, v0, #16 ; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x1] +; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_default_align: @@ -660,7 +660,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; CHECK-NEXT: ushr.4s v0, v0, #16 ; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x1] +; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_align_4: @@ -693,9 +693,8 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; CHECK-NEXT: add x9, x1, #3 ; CHECK-NEXT: ushr.4s v0, v0, #16 ; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: add x8, x1, #1 ; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x8] +; CHECK-NEXT: str b0, [x1, #1] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_1: @@ -729,9 +728,8 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; CHECK-NEXT: add x9, x1, #5 ; CHECK-NEXT: ushr.4s v0, v0, #16 ; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: add x8, x1, #3 ; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: st1.b { v0 }[0], [x8] +; CHECK-NEXT: str b0, [x1, #3] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_3: @@ -807,12 +805,12 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-NEXT: add x8, x0, #2 ; CHECK-NEXT: orr w9, w10, w9, lsl #16 ; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add x9, x0, #1 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 ; CHECK-NEXT: st1.b { v0 }[8], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: st1.b { v0 }[0], [x0] -; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 ; @@ -860,12 +858,12 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr 
%src) { ; CHECK-NEXT: add x8, x0, #2 ; CHECK-NEXT: orr w9, w10, w9, lsl #16 ; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add x9, x0, #1 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 ; CHECK-NEXT: st1.b { v0 }[8], [x8] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: st1.b { v0 }[0], [x0] -; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7 ; diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 710ea70d678c5..f990bdc2e5615 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -109,7 +109,7 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-NEXT: shl.16b v1, v1, #7 ; CHECK-NEXT: mov x12, sp ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: st1.b { v0 }[0], [x8] +; CHECK-NEXT: str b0, [sp] ; CHECK-NEXT: mov x13, sp ; CHECK-NEXT: cmlt.16b v1, v1, #0 ; CHECK-NEXT: umov.b w9, v1[0] From bc61bd5a049d5d9eb997c5753e5af77ba26d93c2 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 2 Apr 2025 17:01:02 +0000 Subject: [PATCH 02/12] WIP: Attempt vector truncstore --- .../Target/AArch64/AArch64ISelLowering.cpp | 40 ++++- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20 ++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 39 +--- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 166 ++++++++++++------ llvm/test/CodeGen/AArch64/add.ll | 12 +- llvm/test/CodeGen/AArch64/andorxor.ll | 36 ++-- .../test/CodeGen/AArch64/arm64-collect-loh.ll | 1 - .../AArch64/arm64-neon-simd-ldst-one.ll | 31 +++- llvm/test/CodeGen/AArch64/arm64-st1.ll | 64 +++++-- llvm/test/CodeGen/AArch64/bitcast-v2i8.ll | 17 +- llvm/test/CodeGen/AArch64/ctlz.ll | 12 +- llvm/test/CodeGen/AArch64/ctpop.ll | 12 +- llvm/test/CodeGen/AArch64/cttz.ll | 12 +- llvm/test/CodeGen/AArch64/mul.ll | 12 +- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 8 +- 
llvm/test/CodeGen/AArch64/nontemporal-load.ll | 2 +- llvm/test/CodeGen/AArch64/pr-cf624b2.ll | 60 +++---- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/store.ll | 12 +- llvm/test/CodeGen/AArch64/sub.ll | 12 +- ...-streaming-mode-fixed-length-ld2-alloca.ll | 4 +- llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 27 +-- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 4 +- .../vec-combine-compare-truncate-store.ll | 12 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 156 ++++++++-------- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 5 +- llvm/test/CodeGen/AArch64/vec_umulo.ll | 7 +- llvm/test/CodeGen/AArch64/vector-compress.ll | 155 ++++++++-------- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 29 +-- 31 files changed, 549 insertions(+), 430 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4505022b884ca..186bd484132b8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1394,6 +1394,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal); + for (auto Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE, @@ -23944,6 +23946,22 @@ static unsigned getFPSubregForVT(EVT VT) { } } +static EVT get64BitVector(EVT ElVT) { + assert(ElVT.isSimple() && "Expected simple VT"); + switch (ElVT.getSimpleVT().SimpleTy) { + case MVT::i8: + return MVT::v8i8; + case MVT::i16: + return MVT::v4i16; + case MVT::i32: + return MVT::v2i32; + case MVT::i64: + return MVT::v1i64; + default: + llvm_unreachable("Unexpected VT!"); + } +} + static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -24022,11 +24040,27 @@ static SDValue 
performSTORECombine(SDNode *N, SDValue ExtIdx = Value.getOperand(1); EVT VectorVT = Vector.getValueType(); EVT ElemVT = VectorVT.getVectorElementType(); - if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8) + if (!ValueVT.isInteger()) return SDValue(); if (ValueVT != MemVT && !ST->isTruncatingStore()) return SDValue(); + if (MemVT == MVT::i8) { + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Value.getValueType(), Vector, ExtIdx); + EVT VecVT64 = get64BitVector(ElemVT); + SDValue ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64, + DAG.getUNDEF(VecVT64), Ext, Zero); + SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector); + return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(), + MVT::v1i8, ST->getMemOperand()); + } + + // TODO: Handle storing i8s to wider types. + if (ElemVT == MVT::i8) + return SDValue(); + // Heuristic: If there are other users of integer scalars extracted from // this vector that won't fold into the store -- abandon folding. Applying // this fold may extend the vector lifetime and disrupt paired stores. @@ -28809,6 +28843,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); + // Can be lowered to a bsub store in ISEL. 
+ if (VT == MVT::v1i64 && MemVT == MVT::v1i8) + return SDValue(); + if (VT.isFloatingPoint() && Store->isTruncatingStore()) { EVT TruncVT = ContainerVT.changeVectorElementType( Store->getMemoryVT().getVectorElementType()); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2be7d90579654..2f3e765c209de 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4463,8 +4463,6 @@ multiclass VecStoreLane0Pat; - defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; defm : VecStoreLane0Pat; @@ -4603,6 +4601,18 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +// v1i64 -> bsub truncating stores +// Supporting pattern lower f32/64 -> v8i8 +def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; +def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i64 FPR64:$src), 0)), + (v8i8 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub), dsub))>; +// Lower v1i64 -> v1i8 truncstore to bsub store +def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (STURBi (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)), + (STRBui (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>; + // Match stores from lane 0 to the appropriate subreg's store. 
multiclass VecStoreULane0Pat; defm : VecStoreULane0Pat; defm : VecStoreULane0Pat; defm : VecStoreULane0Pat; @@ -7271,6 +7280,11 @@ multiclass Neon_INS_elt_pattern; + def : Pat<(VT64 (vector_insert (VT64 (undef)), + (VTScal (vector_extract (VT128 V128:$Rn), (i64 0))), + (i64 0))), + (EXTRACT_SUBREG $Rn, dsub)>; + def : Pat<(VT64 (vector_insert V64:$src, (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a8aacf95dc1ff..579a15eeab339 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1825,43 +1825,6 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat; } // End HasSVE -multiclass SVEVecStoreLanePat { - let Predicates = [HasSVE_or_SME] in { - // Same as Neon VecStoreLane0Pat but without matching VecListOne128. - def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; - } - - // Non-zero immediate index: - def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; -} - -// Note: Types other than i8 are handled in performSTORECombine -- i8 is tricky -// to handle before ISEL as it is not really a legal type in many places, nor -// is its equivalently sized FP variant. 
-let AddedComplexity = 19 in { - // Lane 0 truncating stores - // i32 -> i8 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 -> i8 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i8 -> i8 (technically a truncate as the extracted type is i32) - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; -} - let Predicates = [HasSVE_or_SME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; @@ -3243,6 +3206,8 @@ let Predicates = [HasSVE_or_SME] in { // Insert scalar into undef[0] def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index b91cb872a9e0a..598aa69e30fa6 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING +; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve 
-global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT target triple = "aarch64-unknown-linux-gnu" @@ -106,12 +106,17 @@ entry: } define void @test_str_lane_s8(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, z0.b[7] -; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7] +; CHECK-NONSTREAMING-NEXT: str b0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] +; STREAMING-COMPAT-NEXT: str b0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 7 store i8 %0, ptr %a, align 1 @@ -119,11 +124,16 @@ entry: } define void @test_str_lane0_s8(ptr %a, %b) { -; CHECK-LABEL: test_str_lane0_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0] +; CHECK-NONSTREAMING-NEXT: str b0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str b0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 store i8 %0, ptr %a, align 1 @@ -194,11 +204,18 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, %p0, %p0, %v) { -; CHECK-LABEL: test_str_reduction_i32_to_i8: -; CHECK: // %bb.0: -; CHECK-NEXT: uaddv d0, p0, z0.s -; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8: +; CHECK-NONSTREAMING: // %bb.0: +; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s +; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0] +; CHECK-NONSTREAMING-NEXT: str b0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: 
test_str_reduction_i32_to_i8: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: str b0, [x0] +; STREAMING-COMPAT-NEXT: ret %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) %trunc = trunc i64 %reduce to i8 @@ -248,11 +265,18 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) { -; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset: -; CHECK: // %bb.0: -; CHECK-NEXT: uaddv d0, p0, z0.s -; CHECK-NEXT: stur b0, [x0, #-8] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: +; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s +; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0] +; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i8_negative_offset: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) %trunc = trunc i64 %reduce to i8 @@ -316,12 +340,17 @@ entry: } define void @test_str_lane_s8_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s8_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, z0.b[7] -; CHECK-NEXT: stur b0, [x0, #-8] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7] +; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] +; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 7 %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 @@ -330,11 +359,16 @@ entry: } define 
void @test_str_lane0_s8_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane0_s8_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stur b0, [x0, #-8] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0] +; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 @@ -398,12 +432,17 @@ entry: define void @test_str_trunc_lane_s32_to_s8(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: str b0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: str b0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 %trunc = trunc i32 %0 to i8 @@ -412,10 +451,16 @@ entry: } define void @test_str_trunc_lane0_s32_to_s8(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0] +; CHECK-NONSTREAMING-NEXT: str b0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; 
STREAMING-COMPAT-NEXT: str b0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -468,12 +513,17 @@ entry: } define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: stur b0, [x0, #-8] -; CHECK-NEXT: ret - +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 %trunc = trunc i32 %0 to i8 @@ -483,10 +533,16 @@ entry: } define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stur b0, [x0, #-8] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0] +; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index cdde359d09d7b..ea5dbc03ca174 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -63,9 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], 
[x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -100,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index e11c62ba70de4..709198d830891 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -183,9 +183,9 @@ define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: and_v2i8: @@ -219,9 +219,9 @@ define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: or_v2i8: @@ -255,9 +255,9 @@ define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], 
[x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: xor_v2i8: @@ -292,11 +292,11 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -340,11 +340,11 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -388,11 +388,11 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git 
a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll index 7f2bebf584d8f..246fbbdb80715 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -612,7 +612,6 @@ define <1 x i8> @getL() { ; CHECK-LABEL: _setL ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE -; CHECK-NEXT: ; kill ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll index 7d87be0ce8e1c..2ad567a79d6a5 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll @@ -483,10 +483,16 @@ entry: } define void @test_vst1q_lane_s8(ptr %a, <16 x i8> %b) { -; CHECK-LABEL: test_vst1q_lane_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: st1 { v0.b }[15], [x0] -; CHECK-NEXT: ret +; CHECK-GI-LABEL: test_vst1q_lane_s8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v0.b[0], v0.b[15] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: ret +; +; CHECK-SD-LABEL: test_vst1q_lane_s8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: st1 { v0.b }[15], [x0] +; CHECK-SD-NEXT: ret entry: %0 = extractelement <16 x i8> %b, i32 15 store i8 %0, ptr %a, align 1 @@ -604,11 +610,18 @@ entry: } define void @test_vst1_lane_s8(ptr %a, <8 x i8> %b) { -; CHECK-LABEL: test_vst1_lane_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: st1 { v0.b }[7], [x0] -; CHECK-NEXT: ret +; CHECK-GI-LABEL: test_vst1_lane_s8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov v0.b[0], v0.b[7] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: ret +; +; CHECK-SD-LABEL: test_vst1_lane_s8: +; CHECK-SD: // %bb.0: 
// %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: st1 { v0.b }[7], [x0] +; CHECK-SD-NEXT: ret entry: %0 = extractelement <8 x i8> %b, i32 7 store i8 %0, ptr %a, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index df37231a086f0..b37b952aa5b03 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -5,10 +5,17 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefixes=CHECK,EXYNOS %s define void @st1lane_16b(<16 x i8> %A, ptr %D) { -; CHECK-LABEL: st1lane_16b: -; CHECK: add x8, x0, #1 -; CHECK: st1.b { v0 }[1], [x8] - +; SD-CHECK-LABEL: st1lane_16b: +; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: stur b0, [x0, #1] +; +; GI-CHECK-LABEL: st1lane_16b: +; GI-CHECK: add x8, x0, #1 +; GI-CHECK: st1.b { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_16b: +; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: stur b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 1 store i8 %tmp, ptr %ptr @@ -17,14 +24,14 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) { define void @st1lane0_16b(<16 x i8> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane0_16b: -; SD-CHECK: str b0, [x0, #1] +; SD-CHECK: stur b0, [x0, #1] ; ; GI-CHECK-LABEL: st1lane0_16b: ; GI-CHECK: add x8, x0, #1 ; GI-CHECK: st1.b { v0 }[0], [x8] ; ; EXYNOS-LABEL: st1lane0_16b: -; EXYNOS: str b0, [x0, #1] +; EXYNOS: stur b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 0 @@ -49,10 +56,17 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) { } define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_16b: -; CHECK: add x8, x0, x1 -; CHECK: st1.b { v0 }[1], [x8] - +; SD-CHECK-LABEL: st1lane_ro_16b: +; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: str b0, [x0, x1] +; +; GI-CHECK-LABEL: st1lane_ro_16b: +; GI-CHECK: add x8, x0, x1 +; GI-CHECK: 
st1.b { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_16b: +; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <16 x i8> %A, i32 1 store i8 %tmp, ptr %ptr @@ -311,10 +325,17 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) { } define void @st1lane_8b(<8 x i8> %A, ptr %D) { -; CHECK-LABEL: st1lane_8b: -; CHECK: add x8, x0, #1 -; CHECK: st1.b { v0 }[1], [x8] - +; SD-CHECK-LABEL: st1lane_8b: +; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: stur b0, [x0, #1] +; +; GI-CHECK-LABEL: st1lane_8b: +; GI-CHECK: add x8, x0, #1 +; GI-CHECK: st1.b { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_8b: +; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: stur b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <8 x i8> %A, i32 1 store i8 %tmp, ptr %ptr @@ -322,10 +343,17 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) { } define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_8b: -; CHECK: add x8, x0, x1 -; CHECK: st1.b { v0 }[1], [x8] - +; SD-CHECK-LABEL: st1lane_ro_8b: +; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: str b0, [x0, x1] +; +; GI-CHECK-LABEL: st1lane_ro_8b: +; GI-CHECK: add x8, x0, x1 +; GI-CHECK: st1.b { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_8b: +; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <8 x i8> %A, i32 1 store i8 %tmp, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll index 77304aef4385e..05f66e4b03ed2 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll @@ -1,13 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s ; Part of PR21549: going through the stack isn't ideal but is correct. 
define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) { -; CHECK-LABEL: test_bitcast_v2i8_to_i16 -; CHECK: mov.s [[WREG_HI:w[0-9]+]], v0[1] -; CHECK-NEXT: strb [[WREG_HI]], [sp, #15] -; CHECK-NEXT: str [[WREG_LO:b[0-9]+]], [sp, #14] -; CHECK-NEXT: ldrh w0, [sp, #14] +; CHECK-LABEL: test_bitcast_v2i8_to_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.s v1[0], v0[1] +; CHECK-NEXT: str b0, [sp, #14] +; CHECK-NEXT: stur b1, [sp, #15] +; CHECK-NEXT: ldrh w0, [sp, #14] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %aa = bitcast <2 x i8> %a to i16 ret i16 %aa diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 79676efebe776..f795050e568e6 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -13,9 +13,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: mov v1.s[1], w9 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -48,11 +48,11 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: clz v1.4h, v1.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index 767b9d28d6215..d9cbac7a4c691 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ 
b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -13,9 +13,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -47,11 +47,11 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index 97f5a29064c67..1d9af77eb4a05 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -16,9 +16,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: movi v1.2s, #32 ; CHECK-SD-NEXT: clz v0.2s, v0.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -58,11 +58,11 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: clz v0.4h, v0.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; 
CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 0d7a6a7dbcb11..0270083ad1d06 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -75,9 +75,9 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -112,11 +112,11 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index a070e3d7565ed..c501faa1c567a 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -89,9 +89,9 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) { ; CHECK-LABEL: v2i32_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.s[0], v0.s[1] ; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: strb w8, [x0, #1] +; CHECK-NEXT: stur b1, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i32> %a to <2 x i8> store <2 x i8> %b, ptr %result @@ -155,9 +155,9 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) { ; CHECK-LABEL: 
v2i16_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.s[0], v0.s[1] ; CHECK-NEXT: str b0, [x0] -; CHECK-NEXT: strb w8, [x0, #1] +; CHECK-NEXT: stur b1, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i16> %a to <2 x i8> store <2 x i8> %b, ptr %result diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll index 28cff55beff9e..adb209c0c6348 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -451,7 +451,7 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) { ; CHECK-NEXT: ldnp q0, q1, [x0] ; CHECK-NEXT: ldr b2, [x0, #32] ; CHECK-NEXT: stp q0, q1, [x8] -; CHECK-NEXT: str b2, [x8, #32] +; CHECK-NEXT: stur b2, [x8, #32] ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v33i8: diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll index 0b0540e559abd..f17570837515c 100644 --- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll +++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll @@ -11,45 +11,31 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov v1.b[0], v0.b[7] +; CHECK-NEXT: mov v2.b[0], v0.b[6] +; CHECK-NEXT: stur b0, [sp, #15] +; CHECK-NEXT: stur b0, [sp, #14] +; CHECK-NEXT: stur b0, [sp, #13] +; CHECK-NEXT: stur b0, [sp, #12] +; CHECK-NEXT: stur b1, [sp, #7] +; CHECK-NEXT: mov v1.b[0], v0.b[5] +; CHECK-NEXT: stur b2, [sp, #6] +; CHECK-NEXT: mov v2.b[0], v0.b[4] +; CHECK-NEXT: stur b0, [sp, #11] +; CHECK-NEXT: stur b0, [sp, #10] +; CHECK-NEXT: stur b1, [sp, #5] +; CHECK-NEXT: mov v1.b[0], v0.b[3] +; CHECK-NEXT: stur b0, [sp, #9] +; CHECK-NEXT: stur b2, [sp, #4] +; CHECK-NEXT: mov v2.b[0], v0.b[2] ; 
CHECK-NEXT: str b0, [sp] -; CHECK-NEXT: orr x9, x8, #0xf -; CHECK-NEXT: orr x10, x8, #0xe -; CHECK-NEXT: st1 { v0.b }[15], [x9] -; CHECK-NEXT: orr x9, x8, #0xc -; CHECK-NEXT: st1 { v0.b }[12], [x9] -; CHECK-NEXT: orr x9, x8, #0x8 -; CHECK-NEXT: st1 { v0.b }[8], [x9] -; CHECK-NEXT: orr x9, x8, #0x7 -; CHECK-NEXT: st1 { v0.b }[7], [x9] -; CHECK-NEXT: orr x9, x8, #0x6 -; CHECK-NEXT: st1 { v0.b }[6], [x9] -; CHECK-NEXT: orr x9, x8, #0x4 -; CHECK-NEXT: st1 { v0.b }[4], [x9] -; CHECK-NEXT: orr x9, x8, #0x3 -; CHECK-NEXT: st1 { v0.b }[3], [x9] -; CHECK-NEXT: orr x9, x8, #0x2 -; CHECK-NEXT: st1 { v0.b }[14], [x10] -; CHECK-NEXT: mov w10, #13 // =0xd -; CHECK-NEXT: st1 { v0.b }[2], [x9] -; CHECK-NEXT: orr x9, x8, #0x1 -; CHECK-NEXT: st1 { v0.b }[1], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #11 // =0xb -; CHECK-NEXT: st1 { v0.b }[13], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #10 // =0xa -; CHECK-NEXT: st1 { v0.b }[11], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #9 // =0x9 -; CHECK-NEXT: st1 { v0.b }[10], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #5 // =0x5 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: st1 { v0.b }[9], [x9] -; CHECK-NEXT: st1 { v0.b }[5], [x8] +; CHECK-NEXT: mov v0.b[0], v0.b[1] +; CHECK-NEXT: stur b1, [sp, #3] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: stur b2, [sp, #2] +; CHECK-NEXT: stur b0, [sp, #8] +; CHECK-NEXT: stur b0, [sp, #1] ; CHECK-NEXT: ldr q0, [sp] ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: add sp, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index cbb3b06030bae..18457d2b27781 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -200,9 +200,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 -; CHECK-SD-NEXT: mov 
w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] -; CHECK-SD-NEXT: strb w8, [x2, #1] +; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 04b379f455008..257d2a1c1ebda 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -201,9 +201,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] -; CHECK-SD-NEXT: strb w8, [x2, #1] +; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 7ea957d9d165d..296b860be2a76 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -110,9 +110,9 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v2i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v2i8: @@ -230,12 +230,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ define void @store_v7i8(<7 x i8> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v7i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: add x8, x0, #6 -; CHECK-SD-NEXT: add x9, x0, #4 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov v1.b[0], v0.b[6] +; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: str s0, [x0] -; CHECK-SD-NEXT: st1 { v0.b }[6], [x8] -; CHECK-SD-NEXT: st1 { v0.h }[2], [x9] +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: 
stur b1, [x0, #6] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v7i8: diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 91a17a89af6e1..c3cc6169f3969 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -63,9 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] -; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: @@ -100,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: umov w8, v0.h[2] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w9, [sp, #12] -; CHECK-SD-NEXT: strb w8, [x0, #2] -; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: stur b0, [x0, #2] +; CHECK-SD-NEXT: strh w8, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 89a06bc9d5b4e..27aa5019fb259 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -73,7 +73,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: zip1 z1.s, z1.s, z0.s ; CHECK-NEXT: st1b { z1.h }, p0, [x8] ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] -; CHECK-NEXT: str b0, [x19, #2] +; CHECK-NEXT: stur b0, [x19, #2] ; CHECK-NEXT: str h1, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 @@ -122,7 +122,7 @@ define void 
@alloc_v32i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b ; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: str b1, [x19, #8] +; CHECK-NEXT: stur b1, [x19, #8] ; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll index 184e8fff154b9..f2389b3e94846 100644 --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -706,7 +706,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q4, q0, [x0, #48] -; CHECK-NEXT: add x9, x1, #10 +; CHECK-NEXT: add x9, x1, #8 ; CHECK-NEXT: ldr d1, [x0, #80] ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: ldr q5, [x0, #32] @@ -719,10 +719,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: uzp1.8h v1, v2, v1 ; CHECK-NEXT: uzp1.8b v2, v0, v0 ; CHECK-NEXT: uzp1.16b v0, v1, v0 -; CHECK-NEXT: st1.b { v2 }[2], [x9] -; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: mov.b v1[0], v2[2] +; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: st1.h { v0 }[4], [x9] -; CHECK-NEXT: str d0, [x1], #16 +; CHECK-NEXT: stur b1, [x1, #10] +; CHECK-NEXT: add x1, x1, #16 ; CHECK-NEXT: b.eq LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -742,7 +743,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-BE-NEXT: ld1 { v2.2d }, [x9] ; CHECK-BE-NEXT: ldr d5, [x0, #80] ; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x9, x1, #10 +; CHECK-BE-NEXT: add x9, x1, #8 ; CHECK-BE-NEXT: subs x8, x8, #1 ; CHECK-BE-NEXT: uzp1 v1.4s, v3.4s, v1.4s ; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v5.4s @@ -754,10 +755,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-BE-NEXT: rev16 v2.16b, v1.16b ; 
CHECK-BE-NEXT: rev64 v1.16b, v1.16b -; CHECK-BE-NEXT: st1 { v0.b }[2], [x9] -; CHECK-BE-NEXT: add x9, x1, #8 +; CHECK-BE-NEXT: mov v0.b[0], v0.b[2] +; CHECK-BE-NEXT: str d1, [x1] +; CHECK-BE-NEXT: stur b0, [x1, #10] +; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: st1 { v2.h }[4], [x9] -; CHECK-BE-NEXT: str d1, [x1], #16 ; CHECK-BE-NEXT: b.eq .LBB6_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -777,7 +779,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-DISABLE-NEXT: ld1 { v2.2d }, [x9] ; CHECK-DISABLE-NEXT: ldr d5, [x0, #80] ; CHECK-DISABLE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-DISABLE-NEXT: add x9, x1, #10 +; CHECK-DISABLE-NEXT: add x9, x1, #8 ; CHECK-DISABLE-NEXT: subs x8, x8, #1 ; CHECK-DISABLE-NEXT: uzp1 v1.4s, v3.4s, v1.4s ; CHECK-DISABLE-NEXT: uzp1 v0.4s, v0.4s, v5.4s @@ -789,10 +791,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b ; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b -; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9] -; CHECK-DISABLE-NEXT: add x9, x1, #8 +; CHECK-DISABLE-NEXT: mov v0.b[0], v0.b[2] +; CHECK-DISABLE-NEXT: str d1, [x1] +; CHECK-DISABLE-NEXT: stur b0, [x1, #10] +; CHECK-DISABLE-NEXT: add x1, x1, #16 ; CHECK-DISABLE-NEXT: st1 { v2.h }[4], [x9] -; CHECK-DISABLE-NEXT: str d1, [x1], #16 ; CHECK-DISABLE-NEXT: b.eq .LBB6_1 ; CHECK-DISABLE-NEXT: // %bb.2: // %exit ; CHECK-DISABLE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index edd96ae4836a4..19178964710cd 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -198,9 +198,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: 
str b0, [x2] -; CHECK-SD-NEXT: strb w8, [x2, #1] +; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 63ca1b51c2291..443bd46bb71da 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -197,9 +197,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v0.s[1], w10 ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] -; CHECK-SD-NEXT: strb w8, [x2, #1] +; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i8: diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll index d9b5a42ba98a6..3c42079dc8d8a 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll @@ -35,8 +35,7 @@ define void @store_8_elements(<8 x i16> %vec, ptr %out) { ; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] ; CHECK-NEXT: bic.16b v0, v1, v0 ; CHECK-NEXT: addv.8h h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 @@ -76,8 +75,7 @@ define void @store_2_elements(<2 x i64> %vec, ptr %out) { ; CHECK-NEXT: ldr q1, [x8, lCPI3_0@PAGEOFF] ; CHECK-NEXT: bic.16b v0, v1, v0 ; CHECK-NEXT: addp.2d d0, v0 -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7 @@ -119,8 +117,7 @@ define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) { ; CHECK-NEXT: cmlt.4h v0, v0, #0 ; CHECK-NEXT: and.8b v0, v0, v1 ; CHECK-NEXT: addv.4h h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; 
CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11 @@ -159,8 +156,7 @@ define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) { ; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF] ; CHECK-NEXT: bic.8b v0, v1, v0 ; CHECK-NEXT: addv.4h h0, v0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh15 diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 4aa7fa8b22b4f..66b37d1913505 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -355,14 +355,14 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) { define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: store_trunc_from_64bits: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: add x8, x0, #4 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: add x9, x0, #4 -; CHECK-NEXT: ld1r.4h { v0 }, [x9] ; CHECK-NEXT: lsr w9, w8, #16 ; CHECK-NEXT: strb w8, [x1] -; CHECK-NEXT: add x8, x1, #2 +; CHECK-NEXT: mov.b v0[0], v0[4] ; CHECK-NEXT: strb w9, [x1, #1] -; CHECK-NEXT: st1.b { v0 }[4], [x8] +; CHECK-NEXT: stur b0, [x1, #2] ; CHECK-NEXT: ret ; ; BE-LABEL: store_trunc_from_64bits: @@ -397,13 +397,13 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: adrp x8, lCPI11_0@PAGE ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr d1, [x8, lCPI11_0@PAGEOFF] -; CHECK-NEXT: add x8, x1, #1 ; CHECK-NEXT: ld1.h { v0 }[2], [x9] -; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: st1.b { v0 }[2], [x8] -; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: mov.b v1[0], v0[2] +; CHECK-NEXT: mov.b v2[0], v0[4] ; CHECK-NEXT: str b0, [x1] +; CHECK-NEXT: stur b1, [x1, #1] +; CHECK-NEXT: stur b2, [x1, #2] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 ; @@ -420,12 +420,12 @@ 
define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v1.4h }, [x8] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #2] +; BE-NEXT: strh w8, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -587,12 +587,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store: ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: mov.b v1[0], v0[4] +; CHECK-NEXT: mov.b v2[0], v0[8] ; CHECK-NEXT: str b0, [x1] +; CHECK-NEXT: stur b1, [x1, #1] +; CHECK-NEXT: stur b2, [x1, #2] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store: @@ -602,12 +602,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #2] +; BE-NEXT: strh w8, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -621,12 +621,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_default_align: ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: mov.b v1[0], v0[4] +; CHECK-NEXT: mov.b 
v2[0], v0[8] ; CHECK-NEXT: str b0, [x1] +; CHECK-NEXT: stur b1, [x1, #1] +; CHECK-NEXT: stur b2, [x1, #2] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_default_align: @@ -636,12 +636,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #2] +; BE-NEXT: strh w8, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -655,12 +655,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_align_4: ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: add x9, x1, #2 ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: mov.b v1[0], v0[4] +; CHECK-NEXT: mov.b v2[0], v0[8] ; CHECK-NEXT: str b0, [x1] +; CHECK-NEXT: stur b1, [x1, #1] +; CHECK-NEXT: stur b2, [x1, #2] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_align_4: @@ -670,12 +670,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #2] +; BE-NEXT: strh w8, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -689,12 +689,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_const_offset_1: ; CHECK: ; %bb.0: ; 
CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: add x9, x1, #3 ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: str b0, [x1, #1] +; CHECK-NEXT: mov.b v1[0], v0[4] +; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: stur b0, [x1, #1] +; CHECK-NEXT: stur b1, [x1, #2] +; CHECK-NEXT: stur b2, [x1, #3] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_1: @@ -704,12 +704,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #3] -; BE-NEXT: sturh w9, [x1, #1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #3] +; BE-NEXT: sturh w8, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -724,12 +724,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store_const_offset_3: ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: add x9, x1, #5 ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] -; CHECK-NEXT: str b0, [x1, #3] +; CHECK-NEXT: mov.b v1[0], v0[4] +; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: stur b0, [x1, #3] +; CHECK-NEXT: stur b1, [x1, #4] +; CHECK-NEXT: stur b2, [x1, #5] ; CHECK-NEXT: ret ; ; BE-LABEL: shift_trunc_store_const_offset_3: @@ -739,12 +739,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; 
BE-NEXT: strb w8, [x1, #5] -; BE-NEXT: sturh w9, [x1, #3] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #5] +; BE-NEXT: sturh w8, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -763,11 +763,11 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 -; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: mov.h v0[0], v0[2] ; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w9, [sp, #12] -; CHECK-NEXT: strb w8, [x1, #2] -; CHECK-NEXT: strh w9, [x1] +; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: stur b0, [x1, #2] +; CHECK-NEXT: strh w8, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -778,12 +778,12 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: ldrh w8, [sp, #12] +; BE-NEXT: stur b0, [x1, #2] +; BE-NEXT: strh w8, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -802,15 +802,15 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-NEXT: adrp x8, lCPI22_0@PAGE ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr q1, [x8, lCPI22_0@PAGEOFF] -; CHECK-NEXT: add x8, x0, #2 ; CHECK-NEXT: orr w9, w10, w9, lsl #16 ; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add x9, x0, #1 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: st1.b { v0 }[8], [x8] -; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: mov.b v1[0], v0[8] +; CHECK-NEXT: mov.b v2[0], v0[4] ; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: stur b1, [x0, #2] +; CHECK-NEXT: stur b2, [x0, #1] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 ; @@ -830,12 +830,12 @@ define 
void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w9, [sp, #8] -; BE-NEXT: strb w8, [x0, #2] -; BE-NEXT: strh w9, [x0] +; BE-NEXT: ldrh w8, [sp, #8] +; BE-NEXT: stur b0, [x0, #2] +; BE-NEXT: strh w8, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -855,15 +855,15 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-NEXT: adrp x8, lCPI23_0@PAGE ; CHECK-NEXT: Lloh7: ; CHECK-NEXT: ldr q1, [x8, lCPI23_0@PAGEOFF] -; CHECK-NEXT: add x8, x0, #2 ; CHECK-NEXT: orr w9, w10, w9, lsl #16 ; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: add x9, x0, #1 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: st1.b { v0 }[8], [x8] -; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: mov.b v1[0], v0[8] +; CHECK-NEXT: mov.b v2[0], v0[4] ; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: stur b1, [x0, #2] +; CHECK-NEXT: stur b2, [x0, #1] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7 ; @@ -883,12 +883,12 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: mov v0.h[0], v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w9, [sp, #8] -; BE-NEXT: strb w8, [x0, #2] -; BE-NEXT: strh w9, [x0] +; BE-NEXT: ldrh w8, [sp, #8] +; BE-NEXT: stur b0, [x0, #2] +; BE-NEXT: strh w8, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 37c6374215d81..09662aef7e423 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ 
b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -249,15 +249,14 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: adrp x8, .LCPI10_0 ; CHECK-NEXT: shl v1.4h, v2.4h, #15 -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: addv h1, v1.4h -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: str b1, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index 3a481efd9785a..7803c095b77c2 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -299,11 +299,10 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: shl v0.4h, v0.4h, #15 ; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: addv h1, v0.4h +; CHECK-NEXT: and v1.8b, v0.8b, v1.8b ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: addv h1, v1.4h +; CHECK-NEXT: str b1, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index f990bdc2e5615..0979a80f7f22e 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -107,94 +107,109 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; 
CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: shl.16b v1, v1, #7 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov.b v2[0], v0[1] +; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: str b0, [sp] -; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: mov.b v3[0], v0[2] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov.b v4[0], v0[3] +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: cmlt.16b v1, v1, #0 -; CHECK-NEXT: umov.b w9, v1[0] -; CHECK-NEXT: umov.b w10, v1[1] -; CHECK-NEXT: umov.b w11, v1[2] +; CHECK-NEXT: umov.b w11, v1[0] +; CHECK-NEXT: umov.b w12, v1[1] +; CHECK-NEXT: umov.b w13, v1[2] ; CHECK-NEXT: umov.b w14, v1[3] -; CHECK-NEXT: bfxil x12, x9, #0, #1 -; CHECK-NEXT: and x10, x10, #0x1 -; CHECK-NEXT: and x9, x9, #0x1 -; CHECK-NEXT: add x9, x9, x10 -; CHECK-NEXT: umov.b w10, v1[4] +; CHECK-NEXT: bfxil x10, x11, #0, #1 ; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: st1.b { v0 }[1], [x12] -; CHECK-NEXT: orr x12, x8, x9 -; CHECK-NEXT: add x9, x9, x11 -; CHECK-NEXT: umov.b w11, v1[5] +; CHECK-NEXT: and x13, x13, #0x1 ; CHECK-NEXT: and x14, x14, #0x1 -; CHECK-NEXT: st1.b { v0 }[2], [x12] -; CHECK-NEXT: add x14, x9, x14 -; CHECK-NEXT: umov.b w12, v1[6] -; CHECK-NEXT: orr x9, x8, x9 -; CHECK-NEXT: and x10, x10, #0x1 -; CHECK-NEXT: st1.b { v0 }[3], [x9] -; CHECK-NEXT: orr x9, x8, x14 -; CHECK-NEXT: add x10, x14, x10 -; CHECK-NEXT: umov.b w14, v1[7] -; CHECK-NEXT: st1.b { v0 }[4], [x9] -; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: bfxil x13, x10, #0, #4 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: umov.b w11, v1[8] +; CHECK-NEXT: str b2, [x10] +; CHECK-NEXT: and x10, x12, #0x1 +; CHECK-NEXT: umov.b w12, v1[4] +; CHECK-NEXT: mov.b v2[0], v0[4] +; CHECK-NEXT: add x10, x11, x10 +; CHECK-NEXT: umov.b w11, v1[5] +; CHECK-NEXT: add x13, x10, x13 +; CHECK-NEXT: orr x10, x8, x10 +; CHECK-NEXT: str b3, [x10] +; CHECK-NEXT: orr x10, x8, x13 +; CHECK-NEXT: add x13, x13, x14 ; CHECK-NEXT: and x12, x12, #0x1 -; CHECK-NEXT: bfxil 
x9, x10, #0, #4 -; CHECK-NEXT: st1.b { v0 }[5], [x13] -; CHECK-NEXT: umov.b w13, v1[9] -; CHECK-NEXT: add x10, x10, x12 -; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: umov.b w14, v1[6] +; CHECK-NEXT: str b4, [x10] +; CHECK-NEXT: add x12, x13, x12 +; CHECK-NEXT: orr x13, x8, x13 +; CHECK-NEXT: mov.b v3[0], v0[5] +; CHECK-NEXT: str b2, [x13] +; CHECK-NEXT: umov.b w13, v1[7] +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: bfxil x9, x12, #0, #4 +; CHECK-NEXT: add x11, x12, x11 +; CHECK-NEXT: umov.b w12, v1[8] +; CHECK-NEXT: mov.b v4[0], v0[6] +; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: and x14, x14, #0x1 -; CHECK-NEXT: st1.b { v0 }[6], [x9] -; CHECK-NEXT: umov.b w9, v1[10] -; CHECK-NEXT: bfxil x12, x10, #0, #4 -; CHECK-NEXT: add x10, x10, x14 +; CHECK-NEXT: mov.b v2[0], v0[7] +; CHECK-NEXT: bfxil x10, x11, #0, #4 +; CHECK-NEXT: add x11, x11, x14 ; CHECK-NEXT: mov x14, sp -; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: bfxil x14, x10, #0, #4 -; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: str b3, [x9] +; CHECK-NEXT: umov.b w9, v1[9] ; CHECK-NEXT: and x13, x13, #0x1 -; CHECK-NEXT: st1.b { v0 }[7], [x12] -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: bfxil x11, x10, #0, #4 -; CHECK-NEXT: add x10, x10, x13 -; CHECK-NEXT: umov.b w13, v1[11] -; CHECK-NEXT: st1.b { v0 }[8], [x14] -; CHECK-NEXT: umov.b w14, v1[12] +; CHECK-NEXT: mov.b v3[0], v0[8] +; CHECK-NEXT: bfxil x14, x11, #0, #4 +; CHECK-NEXT: add x11, x11, x13 +; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: and x12, x12, #0x1 +; CHECK-NEXT: str b4, [x10] +; CHECK-NEXT: bfxil x13, x11, #0, #4 +; CHECK-NEXT: add x10, x11, x12 +; CHECK-NEXT: umov.b w12, v1[10] +; CHECK-NEXT: str b2, [x14] +; CHECK-NEXT: mov.b v2[0], v0[9] +; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: and x9, x9, #0x1 -; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: str b3, [x13] +; CHECK-NEXT: mov.b v3[0], v0[10] +; CHECK-NEXT: umov.b w13, v1[11] +; CHECK-NEXT: bfxil x11, x10, #0, #4 ; CHECK-NEXT: add x9, x10, x9 ; CHECK-NEXT: mov x10, sp -; 
CHECK-NEXT: st1.b { v0 }[9], [x11] -; CHECK-NEXT: umov.b w11, v1[13] +; CHECK-NEXT: mov.b v4[0], v0[11] ; CHECK-NEXT: bfxil x10, x9, #0, #4 -; CHECK-NEXT: st1.b { v0 }[10], [x12] -; CHECK-NEXT: umov.b w12, v1[14] -; CHECK-NEXT: and x13, x13, #0x1 -; CHECK-NEXT: and x14, x14, #0x1 -; CHECK-NEXT: add x9, x9, x13 -; CHECK-NEXT: st1.b { v0 }[11], [x10] +; CHECK-NEXT: and x12, x12, #0x1 +; CHECK-NEXT: umov.b w14, v1[12] +; CHECK-NEXT: add x9, x9, x12 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: str b2, [x11] +; CHECK-NEXT: umov.b w11, v1[13] +; CHECK-NEXT: bfxil x12, x9, #0, #4 +; CHECK-NEXT: str b3, [x10] +; CHECK-NEXT: and x10, x13, #0x1 +; CHECK-NEXT: umov.b w13, v1[14] +; CHECK-NEXT: mov.b v1[0], v0[12] +; CHECK-NEXT: str b4, [x12] +; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: add x13, x9, x14 -; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: and x12, x14, #0x1 ; CHECK-NEXT: bfxil x10, x9, #0, #4 -; CHECK-NEXT: and x9, x11, #0x1 +; CHECK-NEXT: mov.b v2[0], v0[13] +; CHECK-NEXT: add x9, x9, x12 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: bfxil x12, x9, #0, #4 +; CHECK-NEXT: add x9, x9, x11 +; CHECK-NEXT: mov.b v3[0], v0[14] ; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: add x9, x13, x9 -; CHECK-NEXT: and w12, w12, #0x1 -; CHECK-NEXT: bfxil x14, x13, #0, #4 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: mov.b v0[0], v0[15] ; CHECK-NEXT: bfxil x11, x9, #0, #4 -; CHECK-NEXT: add w9, w9, w12 -; CHECK-NEXT: st1.b { v0 }[12], [x10] +; CHECK-NEXT: add w9, w9, w13 +; CHECK-NEXT: str b1, [x10] ; CHECK-NEXT: bfxil x8, x9, #0, #4 -; CHECK-NEXT: st1.b { v0 }[13], [x14] -; CHECK-NEXT: st1.b { v0 }[14], [x11] -; CHECK-NEXT: st1.b { v0 }[15], [x8] +; CHECK-NEXT: str b2, [x12] +; CHECK-NEXT: str b3, [x11] +; CHECK-NEXT: str b0, [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll 
b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 6536f0c355b47..e3c4fe44d201d 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2702,28 +2702,29 @@ define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-BE-NEXT: mov x9, v2.d[1] ; CHECK-BE-NEXT: mov x10, v1.d[1] +; CHECK-BE-NEXT: mov x9, v2.d[1] ; CHECK-BE-NEXT: fmov x13, d1 ; CHECK-BE-NEXT: mov x11, v3.d[1] ; CHECK-BE-NEXT: mov x12, v0.d[1] -; CHECK-BE-NEXT: fmov x14, d2 -; CHECK-BE-NEXT: fmov x15, d3 +; CHECK-BE-NEXT: mov v1.d[0], v1.d[1] +; CHECK-BE-NEXT: orr x10, x10, x13, lsl #33 +; CHECK-BE-NEXT: fmov x13, d2 ; CHECK-BE-NEXT: lsl x9, x9, #2 -; CHECK-BE-NEXT: orr x13, x10, x13, lsl #33 -; CHECK-BE-NEXT: strb w10, [x1, #32] ; CHECK-BE-NEXT: lsl x11, x11, #4 ; CHECK-BE-NEXT: lsl x12, x12, #6 -; CHECK-BE-NEXT: orr x14, x9, x14, lsl #35 -; CHECK-BE-NEXT: extr x9, x9, x13, #8 +; CHECK-BE-NEXT: stur b1, [x1, #32] +; CHECK-BE-NEXT: orr x13, x9, x13, lsl #35 +; CHECK-BE-NEXT: extr x9, x9, x10, #8 +; CHECK-BE-NEXT: fmov x10, d3 +; CHECK-BE-NEXT: orr x10, x11, x10, lsl #37 +; CHECK-BE-NEXT: extr x11, x11, x13, #8 ; CHECK-BE-NEXT: fmov x13, d0 -; CHECK-BE-NEXT: orr x15, x11, x15, lsl #37 -; CHECK-BE-NEXT: extr x10, x11, x14, #8 -; CHECK-BE-NEXT: orr x11, x12, x13, lsl #39 -; CHECK-BE-NEXT: extr x12, x12, x15, #8 -; CHECK-BE-NEXT: stp x10, x9, [x1, #16] -; CHECK-BE-NEXT: lsr x9, x11, #8 -; CHECK-BE-NEXT: stp x9, x12, [x1], #128 +; CHECK-BE-NEXT: stp x11, x9, [x1, #16] +; CHECK-BE-NEXT: extr x9, x12, x10, #8 +; CHECK-BE-NEXT: orr x13, x12, x13, lsl #39 +; CHECK-BE-NEXT: lsr x10, x13, #8 +; CHECK-BE-NEXT: stp x10, x9, [x1], #128 ; CHECK-BE-NEXT: b.ne .LBB22_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret From 7842bc45e600530925ffe57619205eeefa2d2283 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 2 Apr 2025 
22:24:31 +0000 Subject: [PATCH 03/12] Prefer st1.b in some cases --- .../Target/AArch64/AArch64ISelLowering.cpp | 9 + .../AArch64/arm64-neon-simd-ldst-one.ll | 31 +--- llvm/test/CodeGen/AArch64/vector-compress.ll | 155 ++++++++---------- 3 files changed, 88 insertions(+), 107 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 186bd484132b8..ef7d5b162847b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24046,6 +24046,15 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); if (MemVT == MVT::i8) { + auto *ExtCst = dyn_cast(ExtIdx); + if (Subtarget->isNeonAvailable() && + (VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst && + !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) { + // These can lower to st1.b, which is preferable if we're unlikely to + // fold the addressing into the store. + return SDValue(); + } + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Value.getValueType(), Vector, ExtIdx); diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll index 2ad567a79d6a5..7d87be0ce8e1c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll @@ -483,16 +483,10 @@ entry: } define void @test_vst1q_lane_s8(ptr %a, <16 x i8> %b) { -; CHECK-GI-LABEL: test_vst1q_lane_s8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v0.b[0], v0.b[15] -; CHECK-GI-NEXT: str b0, [x0] -; CHECK-GI-NEXT: ret -; -; CHECK-SD-LABEL: test_vst1q_lane_s8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: st1 { v0.b }[15], [x0] -; CHECK-SD-NEXT: ret +; CHECK-LABEL: test_vst1q_lane_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: st1 { v0.b }[15], [x0] +; CHECK-NEXT: ret entry: %0 = extractelement <16 x i8> %b, i32 15 
store i8 %0, ptr %a, align 1 @@ -610,18 +604,11 @@ entry: } define void @test_vst1_lane_s8(ptr %a, <8 x i8> %b) { -; CHECK-GI-LABEL: test_vst1_lane_s8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v0.b[0], v0.b[7] -; CHECK-GI-NEXT: str b0, [x0] -; CHECK-GI-NEXT: ret -; -; CHECK-SD-LABEL: test_vst1_lane_s8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: st1 { v0.b }[7], [x0] -; CHECK-SD-NEXT: ret +; CHECK-LABEL: test_vst1_lane_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: st1 { v0.b }[7], [x0] +; CHECK-NEXT: ret entry: %0 = extractelement <8 x i8> %b, i32 7 store i8 %0, ptr %a, align 1 diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 0979a80f7f22e..f990bdc2e5615 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -107,109 +107,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: shl.16b v1, v1, #7 -; CHECK-NEXT: mov.b v2[0], v0[1] -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: str b0, [sp] -; CHECK-NEXT: mov.b v3[0], v0[2] +; CHECK-NEXT: mov x12, sp ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov.b v4[0], v0[3] -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str b0, [sp] +; CHECK-NEXT: mov x13, sp ; CHECK-NEXT: cmlt.16b v1, v1, #0 -; CHECK-NEXT: umov.b w11, v1[0] -; CHECK-NEXT: umov.b w12, v1[1] -; CHECK-NEXT: umov.b w13, v1[2] +; CHECK-NEXT: umov.b w9, v1[0] +; CHECK-NEXT: umov.b w10, v1[1] +; CHECK-NEXT: umov.b w11, v1[2] ; CHECK-NEXT: umov.b w14, v1[3] -; CHECK-NEXT: bfxil x10, x11, #0, #1 +; CHECK-NEXT: bfxil x12, x9, #0, #1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: umov.b w10, v1[4] ; CHECK-NEXT: and x11, x11, 
#0x1 -; CHECK-NEXT: and x13, x13, #0x1 -; CHECK-NEXT: and x14, x14, #0x1 -; CHECK-NEXT: str b2, [x10] -; CHECK-NEXT: and x10, x12, #0x1 -; CHECK-NEXT: umov.b w12, v1[4] -; CHECK-NEXT: mov.b v2[0], v0[4] -; CHECK-NEXT: add x10, x11, x10 +; CHECK-NEXT: st1.b { v0 }[1], [x12] +; CHECK-NEXT: orr x12, x8, x9 +; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: umov.b w11, v1[5] -; CHECK-NEXT: add x13, x10, x13 -; CHECK-NEXT: orr x10, x8, x10 -; CHECK-NEXT: str b3, [x10] -; CHECK-NEXT: orr x10, x8, x13 -; CHECK-NEXT: add x13, x13, x14 -; CHECK-NEXT: and x12, x12, #0x1 -; CHECK-NEXT: umov.b w14, v1[6] -; CHECK-NEXT: str b4, [x10] -; CHECK-NEXT: add x12, x13, x12 -; CHECK-NEXT: orr x13, x8, x13 -; CHECK-NEXT: mov.b v3[0], v0[5] -; CHECK-NEXT: str b2, [x13] -; CHECK-NEXT: umov.b w13, v1[7] +; CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: st1.b { v0 }[2], [x12] +; CHECK-NEXT: add x14, x9, x14 +; CHECK-NEXT: umov.b w12, v1[6] +; CHECK-NEXT: orr x9, x8, x9 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: st1.b { v0 }[3], [x9] +; CHECK-NEXT: orr x9, x8, x14 +; CHECK-NEXT: add x10, x14, x10 +; CHECK-NEXT: umov.b w14, v1[7] +; CHECK-NEXT: st1.b { v0 }[4], [x9] ; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: bfxil x9, x12, #0, #4 -; CHECK-NEXT: add x11, x12, x11 -; CHECK-NEXT: umov.b w12, v1[8] -; CHECK-NEXT: mov.b v4[0], v0[6] -; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: bfxil x13, x10, #0, #4 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: umov.b w11, v1[8] +; CHECK-NEXT: and x12, x12, #0x1 +; CHECK-NEXT: bfxil x9, x10, #0, #4 +; CHECK-NEXT: st1.b { v0 }[5], [x13] +; CHECK-NEXT: umov.b w13, v1[9] +; CHECK-NEXT: add x10, x10, x12 +; CHECK-NEXT: mov x12, sp ; CHECK-NEXT: and x14, x14, #0x1 -; CHECK-NEXT: mov.b v2[0], v0[7] -; CHECK-NEXT: bfxil x10, x11, #0, #4 -; CHECK-NEXT: add x11, x11, x14 +; CHECK-NEXT: st1.b { v0 }[6], [x9] +; CHECK-NEXT: umov.b w9, v1[10] +; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x14 ; CHECK-NEXT: mov x14, sp -; 
CHECK-NEXT: str b3, [x9] -; CHECK-NEXT: umov.b w9, v1[9] -; CHECK-NEXT: and x13, x13, #0x1 -; CHECK-NEXT: mov.b v3[0], v0[8] -; CHECK-NEXT: bfxil x14, x11, #0, #4 -; CHECK-NEXT: add x11, x11, x13 -; CHECK-NEXT: mov x13, sp -; CHECK-NEXT: and x12, x12, #0x1 -; CHECK-NEXT: str b4, [x10] -; CHECK-NEXT: bfxil x13, x11, #0, #4 -; CHECK-NEXT: add x10, x11, x12 -; CHECK-NEXT: umov.b w12, v1[10] -; CHECK-NEXT: str b2, [x14] -; CHECK-NEXT: mov.b v2[0], v0[9] +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: bfxil x14, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x11 ; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: and x9, x9, #0x1 -; CHECK-NEXT: str b3, [x13] -; CHECK-NEXT: mov.b v3[0], v0[10] -; CHECK-NEXT: umov.b w13, v1[11] +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: st1.b { v0 }[7], [x12] +; CHECK-NEXT: mov x12, sp ; CHECK-NEXT: bfxil x11, x10, #0, #4 +; CHECK-NEXT: add x10, x10, x13 +; CHECK-NEXT: umov.b w13, v1[11] +; CHECK-NEXT: st1.b { v0 }[8], [x14] +; CHECK-NEXT: umov.b w14, v1[12] +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: bfxil x12, x10, #0, #4 ; CHECK-NEXT: add x9, x10, x9 ; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mov.b v4[0], v0[11] -; CHECK-NEXT: bfxil x10, x9, #0, #4 -; CHECK-NEXT: and x12, x12, #0x1 -; CHECK-NEXT: umov.b w14, v1[12] -; CHECK-NEXT: add x9, x9, x12 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: str b2, [x11] +; CHECK-NEXT: st1.b { v0 }[9], [x11] ; CHECK-NEXT: umov.b w11, v1[13] -; CHECK-NEXT: bfxil x12, x9, #0, #4 -; CHECK-NEXT: str b3, [x10] -; CHECK-NEXT: and x10, x13, #0x1 -; CHECK-NEXT: umov.b w13, v1[14] -; CHECK-NEXT: mov.b v1[0], v0[12] -; CHECK-NEXT: str b4, [x12] -; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: bfxil x10, x9, #0, #4 +; CHECK-NEXT: st1.b { v0 }[10], [x12] +; CHECK-NEXT: umov.b w12, v1[14] +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: add x9, x9, x13 +; CHECK-NEXT: st1.b { v0 }[11], [x10] ; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: and x12, x14, #0x1 +; CHECK-NEXT: add x13, x9, x14 +; CHECK-NEXT: 
mov x14, sp ; CHECK-NEXT: bfxil x10, x9, #0, #4 -; CHECK-NEXT: mov.b v2[0], v0[13] -; CHECK-NEXT: add x9, x9, x12 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: and x11, x11, #0x1 -; CHECK-NEXT: bfxil x12, x9, #0, #4 -; CHECK-NEXT: add x9, x9, x11 -; CHECK-NEXT: mov.b v3[0], v0[14] +; CHECK-NEXT: and x9, x11, #0x1 ; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: and w13, w13, #0x1 -; CHECK-NEXT: mov.b v0[0], v0[15] +; CHECK-NEXT: add x9, x13, x9 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: bfxil x14, x13, #0, #4 ; CHECK-NEXT: bfxil x11, x9, #0, #4 -; CHECK-NEXT: add w9, w9, w13 -; CHECK-NEXT: str b1, [x10] +; CHECK-NEXT: add w9, w9, w12 +; CHECK-NEXT: st1.b { v0 }[12], [x10] ; CHECK-NEXT: bfxil x8, x9, #0, #4 -; CHECK-NEXT: str b2, [x12] -; CHECK-NEXT: str b3, [x11] -; CHECK-NEXT: str b0, [x8] +; CHECK-NEXT: st1.b { v0 }[13], [x14] +; CHECK-NEXT: st1.b { v0 }[14], [x11] +; CHECK-NEXT: st1.b { v0 }[15], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) From f6466cef2eba8f7091617065192c9302cf400dd4 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 3 Apr 2025 09:18:09 +0000 Subject: [PATCH 04/12] Avoid NOP movs --- .../Target/AArch64/AArch64ISelLowering.cpp | 16 +++- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 -- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 90 ++++++------------- 3 files changed, 38 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ef7d5b162847b..7be6ae5603b25 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24055,12 +24055,20 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } + // Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store). 
SDValue Zero = DAG.getConstant(0, DL, MVT::i64); - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - Value.getValueType(), Vector, ExtIdx); + SDValue ExtVector; EVT VecVT64 = get64BitVector(ElemVT); - SDValue ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64, - DAG.getUNDEF(VecVT64), Ext, Zero); + if (ExtCst && ExtCst->isZero()) { + ExtVector = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero); + } else { + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Value.getValueType(), Vector, ExtIdx); + ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64, + DAG.getUNDEF(VecVT64), Ext, Zero); + } + SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector); return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(), MVT::v1i8, ST->getMemOperand()); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2f3e765c209de..8b419c4f7cb3a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7280,11 +7280,6 @@ multiclass Neon_INS_elt_pattern; - def : Pat<(VT64 (vector_insert (VT64 (undef)), - (VTScal (vector_extract (VT128 V128:$Rn), (i64 0))), - (i64 0))), - (EXTRACT_SUBREG $Rn, dsub)>; - def : Pat<(VT64 (vector_insert V64:$src, (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index 598aa69e30fa6..713ddd9aefe01 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -124,16 +124,10 @@ entry: } define void @test_str_lane0_s8(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0] -; CHECK-NONSTREAMING-NEXT: str b0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: 
test_str_lane0_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str b0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane0_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 0 store i8 %0, ptr %a, align 1 @@ -204,18 +198,11 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, %p0, %p0, %v) { -; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8: -; CHECK-NONSTREAMING: // %bb.0: -; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s -; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0] -; CHECK-NONSTREAMING-NEXT: str b0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i8: -; STREAMING-COMPAT: // %bb.0: -; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s -; STREAMING-COMPAT-NEXT: str b0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_reduction_i32_to_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) %trunc = trunc i64 %reduce to i8 @@ -265,18 +252,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) { -; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: -; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s -; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0] -; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i8_negative_offset: -; STREAMING-COMPAT: // %bb.0: -; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s -; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) %trunc = trunc i64 %reduce to i8 @@ 
-359,16 +339,10 @@ entry: } define void @test_str_lane0_s8_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0] -; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane0_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 0 %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 @@ -451,16 +425,10 @@ entry: } define void @test_str_trunc_lane0_s32_to_s8(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0] -; CHECK-NONSTREAMING-NEXT: str b0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str b0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -533,16 +501,10 @@ entry: } define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0] -; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: +; CHECK: 
// %bb.0: // %entry +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 0 From 7abb3421ddfa9d56be2effc4f252698b7b492bd4 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 3 Apr 2025 12:32:39 +0000 Subject: [PATCH 05/12] Add note --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7be6ae5603b25..a70f7e8a26471 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1394,6 +1394,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // v1i64 -> v1i8 truncstore represents a bsub FPR8 store. setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal); for (auto Op : From 36d54fec45889314b1dd7819c385a9414cf94869 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 4 Apr 2025 13:53:28 +0000 Subject: [PATCH 06/12] Fixups --- llvm/include/llvm/CodeGen/ValueTypes.td | 2 +- llvm/lib/CodeGen/ValueTypes.cpp | 4 ++-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 ++-------- llvm/lib/Target/AArch64/AArch64RegisterInfo.td | 2 +- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 -- 6 files changed, 7 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 42c4830e94220..28216a7a55398 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -338,7 +338,7 @@ def amdgpuBufferFatPointer : ValueType<160, 234>; // FIXME: Remove this and the getPointerType() override if MVT::i82 is added. 
def amdgpuBufferStridedPointer : ValueType<192, 235>; -def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64) +def aarch64mfp8 : ValueType<8, 236>; // 8-bit value in FPR (AArch64) let isNormalValueType = false in { def token : ValueType<0, 504>; // TokenTy diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index c769568253b12..58adada1b1f3c 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -198,8 +198,8 @@ std::string EVT::getEVTString() const { return "amdgpuBufferFatPointer"; case MVT::amdgpuBufferStridedPointer: return "amdgpuBufferStridedPointer"; - case MVT::vi8: - return "vi8"; + case MVT::aarch64mfp8: + return "aarch64mfp8"; } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a70f7e8a26471..978d1f80745f0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -401,7 +401,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } if (Subtarget->hasFPARMv8()) { - addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass); addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 8b419c4f7cb3a..582946a066714 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4602,16 +4602,10 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; // v1i64 -> bsub truncating stores -// Supporting pattern lower f32/64 -> v8i8 -def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i32 FPR32:$src), 0)), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), 
FPR32:$src, ssub)>; -def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i64 FPR64:$src), 0)), - (v8i8 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub), dsub))>; -// Lower v1i64 -> v1i8 truncstore to bsub store def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (STURBi (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>; + (STURBi (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>; def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)), - (STRBui (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>; + (STRBui (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>; // Match stores from lane 0 to the appropriate subreg's store. multiclass VecStoreULane0Pat, DwarfRegAlias, DwarfRegAlias; } -def FPR8 : RegisterClass<"AArch64", [i8, vi8], 8, (sequence "B%u", 0, 31)> { +def FPR8 : RegisterClass<"AArch64", [i8, aarch64mfp8], 8, (sequence "B%u", 0, 31)> { let Size = 8; let DecoderMethod = "DecodeSimpleRegisterClass"; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 579a15eeab339..a2f326c994c2f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3206,8 +3206,6 @@ let Predicates = [HasSVE_or_SME] in { // Insert scalar into undef[0] def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; - def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i64 FPR64:$src), 0)), - (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), From 6857792eb5e54d19ad4a304161e2021e0f8942c5 Mon Sep 17 00:00:00 2001 
From: Benjamin Maxwell Date: Tue, 8 Apr 2025 19:47:56 +0000 Subject: [PATCH 07/12] Generalize fold --- llvm/lib/CodeGen/ValueTypes.cpp | 2 + .../Target/AArch64/AArch64ISelLowering.cpp | 124 ++++----- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 +- .../aarch64-neon-vector-insert-uaddlv.ll | 9 +- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 128 +++++++--- llvm/test/CodeGen/AArch64/add.ll | 6 +- llvm/test/CodeGen/AArch64/andorxor.ll | 18 +- .../test/CodeGen/AArch64/arm64-collect-loh.ll | 1 + .../AArch64/arm64-neon-simd-ldst-one.ll | 14 +- llvm/test/CodeGen/AArch64/arm64-rev.ll | 4 +- llvm/test/CodeGen/AArch64/arm64-st1.ll | 140 +++++++--- llvm/test/CodeGen/AArch64/bitcast.ll | 4 +- llvm/test/CodeGen/AArch64/ctlz.ll | 6 +- llvm/test/CodeGen/AArch64/ctpop.ll | 6 +- llvm/test/CodeGen/AArch64/cttz.ll | 6 +- llvm/test/CodeGen/AArch64/dp1.ll | 3 +- llvm/test/CodeGen/AArch64/mul.ll | 6 +- llvm/test/CodeGen/AArch64/neon-rshrn.ll | 2 +- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 2 +- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 2 +- llvm/test/CodeGen/AArch64/shufflevector.ll | 4 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 2 +- llvm/test/CodeGen/AArch64/store.ll | 42 +-- llvm/test/CodeGen/AArch64/sub.ll | 6 +- .../AArch64/sve-fixed-length-permute-rev.ll | 64 +++-- ...-streaming-mode-fixed-length-ld2-alloca.ll | 3 +- llvm/test/CodeGen/AArch64/tbl-loops.ll | 7 +- llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 28 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 2 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 2 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 48 ++-- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 34 +-- llvm/test/CodeGen/AArch64/vec_umulo.ll | 37 ++- llvm/test/CodeGen/AArch64/vector-compress.ll | 62 ++--- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 239 +++++++++--------- 35 files changed, 586 insertions(+), 487 deletions(-) diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 58adada1b1f3c..b6d287295e1dd 100644 --- 
a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -223,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case MVT::aarch64svcount: return TargetExtType::get(Context, "aarch64.svcount"); + case MVT::aarch64mfp8: + return VectorType::get(IntegerType::get(Context, 8), ElementCount::getFixed(1)); case MVT::x86amx: return Type::getX86_AMXTy(Context); case MVT::i64x8: return IntegerType::get(Context, 512); case MVT::amdgpuBufferFatPointer: return IntegerType::get(Context, 160); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 978d1f80745f0..0b8bd93f50471 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1394,9 +1394,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // v1i64 -> v1i8 truncstore represents a bsub FPR8 store. 
- setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal); - for (auto Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE, @@ -23936,6 +23933,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, static unsigned getFPSubregForVT(EVT VT) { assert(VT.isSimple() && "Expected simple VT"); switch (VT.getSimpleVT().SimpleTy) { + case MVT::aarch64mfp8: + return AArch64::bsub; case MVT::f16: return AArch64::hsub; case MVT::f32: @@ -23947,22 +23946,6 @@ static unsigned getFPSubregForVT(EVT VT) { } } -static EVT get64BitVector(EVT ElVT) { - assert(ElVT.isSimple() && "Expected simple VT"); - switch (ElVT.getSimpleVT().SimpleTy) { - case MVT::i8: - return MVT::v8i8; - case MVT::i16: - return MVT::v4i16; - case MVT::i32: - return MVT::v2i32; - case MVT::i64: - return MVT::v1i64; - default: - llvm_unreachable("Unexpected VT!"); - } -} - static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -24041,72 +24024,63 @@ static SDValue performSTORECombine(SDNode *N, SDValue ExtIdx = Value.getOperand(1); EVT VectorVT = Vector.getValueType(); EVT ElemVT = VectorVT.getVectorElementType(); + if (!ValueVT.isInteger()) return SDValue(); if (ValueVT != MemVT && !ST->isTruncatingStore()) return SDValue(); - if (MemVT == MVT::i8) { - auto *ExtCst = dyn_cast(ExtIdx); - if (Subtarget->isNeonAvailable() && - (VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst && - !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) { - // These can lower to st1.b, which is preferable if we're unlikely to - // fold the addressing into the store. - return SDValue(); - } - - // Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store). 
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64); - SDValue ExtVector; - EVT VecVT64 = get64BitVector(ElemVT); - if (ExtCst && ExtCst->isZero()) { - ExtVector = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero); - } else { - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, - Value.getValueType(), Vector, ExtIdx); - ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64, - DAG.getUNDEF(VecVT64), Ext, Zero); - } - - SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector); - return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(), - MVT::v1i8, ST->getMemOperand()); - } - - // TODO: Handle storing i8s to wider types. - if (ElemVT == MVT::i8) + // This could generate an additional extract if the index is non-zero and + // the extracted value has multiple uses. + auto *ExtCst = dyn_cast(ExtIdx); + if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse()) return SDValue(); - // Heuristic: If there are other users of integer scalars extracted from - // this vector that won't fold into the store -- abandon folding. Applying - // this fold may extend the vector lifetime and disrupt paired stores. - for (const auto &Use : Vector->uses()) { - if (Use.getResNo() != Vector.getResNo()) - continue; - const SDNode *User = Use.getUser(); - if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - (!User->hasOneUse() || - (*User->user_begin())->getOpcode() != ISD::STORE)) - return SDValue(); + if (Subtarget->isNeonAvailable() && ElemVT == MemVT && + (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst && + !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) { + // These can lower to st1, which is preferable if we're unlikely to fold + // the addressing into the store. 
+ return SDValue(); } - EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits()); - EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT); - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector); - SDValue Ext = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx); + if (MemVT == MVT::i64 || MemVT == MVT::i32) { + // Heuristic: If there are other users of w/x integer scalars extracted + // from this vector that won't fold into the store -- abandon folding. + // Applying this fold may disrupt paired stores. + for (const auto &Use : Vector->uses()) { + if (Use.getResNo() != Vector.getResNo()) + continue; + const SDNode *User = Use.getUser(); + if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + (!User->hasOneUse() || + (*User->user_begin())->getOpcode() != ISD::STORE)) + return SDValue(); + } + } - EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits()); - if (ST->isTruncatingStore() && FPMemVT != FPElemVT) { - SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL, - FPMemVT, Ext); - return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(), - ST->getMemOperand()); + SDValue ExtVector = Vector; + if (!ExtCst || !ExtCst->isZero()) { + // Handle extracting from lanes != 0. + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + Value.getValueType(), Vector, ExtIdx); + // FIXME: Using a fixed-size vector for the insertion should not be + // necessary, but SVE ISEL is missing some folds to avoid fmovs. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + EVT InsertVectorVT = EVT::getVectorVT( + *DAG.getContext(), ElemVT, + VectorVT.getVectorElementCount().getKnownMinValue(), false); + ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT, + DAG.getUNDEF(InsertVectorVT), Ext, Zero); } - return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(), + EVT FPMemVT = MemVT == MVT::i8 + ? 
MVT::aarch64mfp8 + : EVT::getFloatingPointVT(MemVT.getSizeInBits()); + SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL, + FPMemVT, ExtVector); + + return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(), ST->getMemOperand()); } @@ -28861,10 +28835,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue()); - // Can be lowered to a bsub store in ISEL. - if (VT == MVT::v1i64 && MemVT == MVT::v1i8) - return SDValue(); - if (VT.isFloatingPoint() && Store->isTruncatingStore()) { EVT TruncVT = ContainerVT.changeVectorElementType( Store->getMemoryVT().getVectorElementType()); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 582946a066714..7ac36e42f56ae 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4601,11 +4601,11 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -// v1i64 -> bsub truncating stores -def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (STURBi (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)), - (STRBui (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>; +// aarch64mfp8 (bsub) stores +def : Pat<(store aarch64mfp8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)), + (STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s4:$offset)>; // Match stores from lane 0 to the appropriate subreg's store. 
multiclass VecStoreULane0Pat %b) { -; CHECK-LABEL: test_str_lane_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: str s0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s32: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: str s0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s32: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: str s0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -31,11 +37,17 @@ entry: } define void @test_str_lane_s64(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: str d0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s64: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[1] +; CHECK-NONSTREAMING-NEXT: str d0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s64: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] +; STREAMING-COMPAT-NEXT: str d0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -135,11 +147,17 @@ entry: } define void @test_str_lane_s16(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, z0.h[3] -; CHECK-NEXT: str h0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s16: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NONSTREAMING-NEXT: str h0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -266,11 +284,17 
@@ define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, %b) { -; CHECK-LABEL: test_str_lane_s32_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: stur s0, [x0, #-32] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: stur s0, [x0, #-32] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -293,11 +317,17 @@ entry: } define void @test_str_lane_s64_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s64_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: stur d0, [x0, #-64] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[1] +; CHECK-NONSTREAMING-NEXT: stur d0, [x0, #-64] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] +; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -351,11 +381,17 @@ entry: } define void @test_str_lane_s16_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s16_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, z0.h[3] -; CHECK-NEXT: stur h0, [x0, #-16] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] +; CHECK-NONSTREAMING-NEXT: ret 
+; +; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -378,11 +414,17 @@ entry: } define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: str h0, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: str h0, [x0] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -452,11 +494,17 @@ entry: } define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: stur h0, [x0, #-16] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] +; CHECK-NONSTREAMING-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index ea5dbc03ca174..44e79fb5e1e37 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ 
b/llvm/test/CodeGen/AArch64/add.ll @@ -231,7 +231,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -262,10 +262,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 709198d830891..30a7e4aca1c47 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -693,7 +693,7 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -729,7 +729,7 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -765,7 +765,7 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, 
[x0, #2] ; CHECK-SD-NEXT: ret @@ -799,8 +799,8 @@ define void @and_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: and_v3i16: @@ -836,8 +836,8 @@ define void @or_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: or_v3i16: @@ -873,8 +873,8 @@ define void @xor_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: xor_v3i16: diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll index 246fbbdb80715..7f2bebf584d8f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -612,6 +612,7 @@ define <1 x i8> @getL() { ; CHECK-LABEL: _setL ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE +; CHECK-NEXT: ; kill ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll index 7d87be0ce8e1c..7721616be436c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll @@ -674,16 +674,10 @@ entry: } 
define void @test_vst1_lane_s64(ptr %a, <1 x i64> %b) { -; CHECK-GI-LABEL: test_vst1_lane_s64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: str d0, [x0] -; CHECK-GI-NEXT: ret -; -; CHECK-SD-LABEL: test_vst1_lane_s64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str d0, [x0] -; CHECK-SD-NEXT: ret +; CHECK-LABEL: test_vst1_lane_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement <1 x i64> %b, i32 0 store i64 %0, ptr %a, align 8 diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll index 14ab7b5108125..2006e5af547c1 100644 --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -462,9 +462,9 @@ define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp ; CHECK-SD-LABEL: test_vrev64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr q0, [x0] -; CHECK-SD-NEXT: add x8, x1, #2 -; CHECK-SD-NEXT: st1.h { v0 }[5], [x8] +; CHECK-SD-NEXT: mov.h v1[0], v0[5] ; CHECK-SD-NEXT: st1.h { v0 }[6], [x1] +; CHECK-SD-NEXT: str h1, [x1, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: test_vrev64: diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index b37b952aa5b03..a4bf5c6e4d5b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -90,9 +90,17 @@ define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { } define void @st1lane_8h(<8 x i16> %A, ptr %D) { -; CHECK-LABEL: st1lane_8h: -; CHECK: add x8, x0, #2 -; CHECK: st1.h { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_8h: +; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: str h0, [x0, #2] +; +; GI-CHECK-LABEL: st1lane_8h: +; GI-CHECK: add x8, x0, #2 +; GI-CHECK: st1.h { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_8h: +; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: str h0, [x0, #2] %ptr = getelementptr i16, ptr %D, i64 1 %tmp = extractelement <8 
x i16> %A, i32 1 store i16 %tmp, ptr %ptr @@ -118,9 +126,17 @@ define void @st1lane0u_8h(<8 x i16> %A, ptr %D) { } define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_8h: -; CHECK: add x8, x0, x1, lsl #1 -; CHECK: st1.h { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_ro_8h: +; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: str h0, [x0, x1, lsl #1] +; +; GI-CHECK-LABEL: st1lane_ro_8h: +; GI-CHECK: add x8, x0, x1, lsl #1 +; GI-CHECK: st1.h { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_8h: +; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: str h0, [x0, x1, lsl #1] %ptr = getelementptr i16, ptr %D, i64 %offset %tmp = extractelement <8 x i16> %A, i32 1 store i16 %tmp, ptr %ptr @@ -137,9 +153,17 @@ define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) { } define void @st1lane_4s(<4 x i32> %A, ptr %D) { -; CHECK-LABEL: st1lane_4s: -; CHECK: add x8, x0, #4 -; CHECK: st1.s { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_4s: +; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: str s0, [x0, #4] +; +; GI-CHECK-LABEL: st1lane_4s: +; GI-CHECK: add x8, x0, #4 +; GI-CHECK: st1.s { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_4s: +; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: str s0, [x0, #4] %ptr = getelementptr i32, ptr %D, i64 1 %tmp = extractelement <4 x i32> %A, i32 1 store i32 %tmp, ptr %ptr @@ -165,9 +189,17 @@ define void @st1lane0u_4s(<4 x i32> %A, ptr %D) { } define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_4s: -; CHECK: add x8, x0, x1, lsl #2 -; CHECK: st1.s { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_ro_4s: +; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: str s0, [x0, x1, lsl #2] +; +; GI-CHECK-LABEL: st1lane_ro_4s: +; GI-CHECK: add x8, x0, x1, lsl #2 +; GI-CHECK: st1.s { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_4s: +; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: str s0, [x0, x1, lsl #2] %ptr = getelementptr i32, ptr %D, i64 %offset %tmp = extractelement <4 x i32> %A, i32 1 store i32 %tmp, ptr %ptr @@ -231,9 +263,17 @@ define 
void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) { } define void @st1lane_2d(<2 x i64> %A, ptr %D) { -; CHECK-LABEL: st1lane_2d: -; CHECK: add x8, x0, #8 -; CHECK: st1.d { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_2d: +; SD-CHECK: mov.d v0[0], v0[1] +; SD-CHECK: str d0, [x0, #8] +; +; GI-CHECK-LABEL: st1lane_2d: +; GI-CHECK: add x8, x0, #8 +; GI-CHECK: st1.d { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_2d: +; EXYNOS: mov.d v0[0], v0[1] +; EXYNOS: str d0, [x0, #8] %ptr = getelementptr i64, ptr %D, i64 1 %tmp = extractelement <2 x i64> %A, i32 1 store i64 %tmp, ptr %ptr @@ -259,9 +299,17 @@ define void @st1lane0u_2d(<2 x i64> %A, ptr %D) { } define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_2d: -; CHECK: add x8, x0, x1, lsl #3 -; CHECK: st1.d { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_ro_2d: +; SD-CHECK: mov.d v0[0], v0[1] +; SD-CHECK: str d0, [x0, x1, lsl #3] +; +; GI-CHECK-LABEL: st1lane_ro_2d: +; GI-CHECK: add x8, x0, x1, lsl #3 +; GI-CHECK: st1.d { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_2d: +; EXYNOS: mov.d v0[0], v0[1] +; EXYNOS: str d0, [x0, x1, lsl #3] %ptr = getelementptr i64, ptr %D, i64 %offset %tmp = extractelement <2 x i64> %A, i32 1 store i64 %tmp, ptr %ptr @@ -377,9 +425,17 @@ define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { } define void @st1lane_4h(<4 x i16> %A, ptr %D) { -; CHECK-LABEL: st1lane_4h: -; CHECK: add x8, x0, #2 -; CHECK: st1.h { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_4h: +; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: str h0, [x0, #2] +; +; GI-CHECK-LABEL: st1lane_4h: +; GI-CHECK: add x8, x0, #2 +; GI-CHECK: st1.h { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_4h: +; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: str h0, [x0, #2] %ptr = getelementptr i16, ptr %D, i64 1 %tmp = extractelement <4 x i16> %A, i32 1 store i16 %tmp, ptr %ptr @@ -405,9 +461,17 @@ define void @st1lane0u_4h(<4 x i16> %A, ptr %D) { } define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) { -; 
CHECK-LABEL: st1lane_ro_4h: -; CHECK: add x8, x0, x1, lsl #1 -; CHECK: st1.h { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_ro_4h: +; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: str h0, [x0, x1, lsl #1] +; +; GI-CHECK-LABEL: st1lane_ro_4h: +; GI-CHECK: add x8, x0, x1, lsl #1 +; GI-CHECK: st1.h { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_4h: +; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: str h0, [x0, x1, lsl #1] %ptr = getelementptr i16, ptr %D, i64 %offset %tmp = extractelement <4 x i16> %A, i32 1 store i16 %tmp, ptr %ptr @@ -424,9 +488,17 @@ define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) { } define void @st1lane_2s(<2 x i32> %A, ptr %D) { -; CHECK-LABEL: st1lane_2s: -; CHECK: add x8, x0, #4 -; CHECK: st1.s { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_2s: +; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: str s0, [x0, #4] +; +; GI-CHECK-LABEL: st1lane_2s: +; GI-CHECK: add x8, x0, #4 +; GI-CHECK: st1.s { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_2s: +; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: str s0, [x0, #4] %ptr = getelementptr i32, ptr %D, i64 1 %tmp = extractelement <2 x i32> %A, i32 1 store i32 %tmp, ptr %ptr @@ -452,9 +524,17 @@ define void @st1lane0u_2s(<2 x i32> %A, ptr %D) { } define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) { -; CHECK-LABEL: st1lane_ro_2s: -; CHECK: add x8, x0, x1, lsl #2 -; CHECK: st1.s { v0 }[1], [x8] +; SD-CHECK-LABEL: st1lane_ro_2s: +; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: str s0, [x0, x1, lsl #2] +; +; GI-CHECK-LABEL: st1lane_ro_2s: +; GI-CHECK: add x8, x0, x1, lsl #2 +; GI-CHECK: st1.s { v0 }[1], [x8] +; +; EXYNOS-LABEL: st1lane_ro_2s: +; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: str s0, [x0, x1, lsl #2] %ptr = getelementptr i32, ptr %D, i64 %offset %tmp = extractelement <2 x i32> %A, i32 1 store i32 %tmp, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index d54cc4adb81b3..adda3b08af66c 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ 
b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -102,7 +102,7 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] @@ -399,7 +399,7 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr s0, [sp, #12] diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index f795050e568e6..f941ecf508055 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -153,7 +153,7 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: mov v1.s[1], w9 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -178,10 +178,10 @@ define void @v3i16(ptr %p1) { ; CHECK-SD-LABEL: v3i16: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: clz v0.4h, v0.4h -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index d9cbac7a4c691..b9671114508db 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -151,7 +151,7 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b 
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -177,11 +177,11 @@ define void @v3i16(ptr %p1) { ; CHECK-SD-LABEL: v3i16: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index 1d9af77eb4a05..03e89c04b184f 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -227,7 +227,7 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: movi v1.2s, #32 ; CHECK-SD-NEXT: clz v0.2s, v0.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -262,14 +262,14 @@ define void @v3i16(ptr %p1) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: movi v0.4h, #1 ; CHECK-SD-NEXT: ldr d1, [x0] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: bic v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: movi v1.4h, #16 ; CHECK-SD-NEXT: clz v0.4h, v0.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll index 4f48aac72ebc3..e904f4b6d247a 100644 --- a/llvm/test/CodeGen/AArch64/dp1.ll +++ b/llvm/test/CodeGen/AArch64/dp1.ll @@ -205,8 +205,7 @@ define void @ctpop_i32() { ; 
CHECK-SDAG-NEXT: fmov d0, x9 ; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b ; CHECK-SDAG-NEXT: addv b0, v0.8b -; CHECK-SDAG-NEXT: fmov w9, s0 -; CHECK-SDAG-NEXT: str w9, [x8] +; CHECK-SDAG-NEXT: str s0, [x8] ; CHECK-SDAG-NEXT: ret ; ; CHECK-GISEL-LABEL: ctpop_i32: diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 0270083ad1d06..ae607ffb56c3e 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -243,7 +243,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -274,10 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll index 8fabd7a618f68..cbfa3f82f21b1 100644 --- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll +++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll @@ -868,7 +868,7 @@ define void @rshrn_v2i32_4(<2 x i32> %a, ptr %p) { ; CHECK-NEXT: movi v1.2s, #8 ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #4 -; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: mov v1.s[0], v0.s[1] ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index c501faa1c567a..86808ae7f9349 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ 
b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -42,7 +42,7 @@ define void @v2i32_v2i16(<2 x i32> %a, ptr %result) { ; CHECK-LABEL: v2i32_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: mov v1.s[0], v0.s[1] ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 18457d2b27781..0a47ced6c05f0 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -255,7 +255,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 4c8f0c9c446f5..6bfb89fe541c8 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -288,7 +288,7 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] @@ -499,7 +499,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: dup v1.2s, v0.s[0] ; CHECK-SD-NEXT: str h0, [sp, #12] -; CHECK-SD-NEXT: mov s1, v1.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v1.s[1] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 diff --git 
a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 257d2a1c1ebda..6c7f458e345ca 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -256,7 +256,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 296b860be2a76..bc4341b8058ef 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -146,13 +146,21 @@ define void @store_v32i8(<32 x i8> %a, ptr %ptr){ } define void @store_v2i16(<2 x i16> %a, ptr %ptr){ -; CHECK-LABEL: store_v2i16: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s1, v0.s[1] -; CHECK-NEXT: str h0, [x0] -; CHECK-NEXT: str h1, [x0, #2] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: store_v2i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v2i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret store <2 x i16> %a, ptr %ptr ret void } @@ -232,10 +240,10 @@ define void @store_v7i8(<7 x i8> %a, ptr %ptr){ ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: mov v1.b[0], v0.b[6] -; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: mov v2.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: stur b1, [x0, #6] +; 
CHECK-SD-NEXT: str h2, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v7i8: @@ -262,10 +270,10 @@ define void @store_v7i8(<7 x i8> %a, ptr %ptr){ define void @store_v3i16(<3 x i16> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v3i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v3i16: @@ -284,11 +292,11 @@ define void @store_v3i16(<3 x i16> %a, ptr %ptr){ define void @store_v7i16(<7 x i16> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v7i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: add x8, x0, #12 -; CHECK-SD-NEXT: add x9, x0, #8 +; CHECK-SD-NEXT: mov v1.h[0], v0.h[6] +; CHECK-SD-NEXT: mov v2.s[0], v0.s[2] ; CHECK-SD-NEXT: str d0, [x0] -; CHECK-SD-NEXT: st1 { v0.h }[6], [x8] -; CHECK-SD-NEXT: st1 { v0.s }[2], [x9] +; CHECK-SD-NEXT: str h1, [x0, #12] +; CHECK-SD-NEXT: str s2, [x0, #8] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v7i16: @@ -314,9 +322,9 @@ define void @store_v7i16(<7 x i16> %a, ptr %ptr){ define void @store_v3i32(<3 x i32> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v3i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: add x8, x0, #8 +; CHECK-SD-NEXT: mov v1.s[0], v0.s[2] ; CHECK-SD-NEXT: str d0, [x0] -; CHECK-SD-NEXT: st1 { v0.s }[2], [x8] +; CHECK-SD-NEXT: str s1, [x0, #8] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: store_v3i32: diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index c3cc6169f3969..f482668be311a 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -231,7 +231,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: 
str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -262,10 +262,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] -; CHECK-SD-NEXT: add x8, x0, #4 ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v3i16: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll index 797f953591b11..e8ab228db4279 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll @@ -376,43 +376,39 @@ define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 { ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: orr x9, x8, #0x1e -; CHECK-NEXT: orr x10, x8, #0x1c +; CHECK-NEXT: ldr q5, [x0] +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: st1 { v0.h }[4], [x9] -; CHECK-NEXT: orr x9, x8, #0x18 -; CHECK-NEXT: st1 { v0.h }[7], [x9] -; CHECK-NEXT: orr x9, x8, #0xe -; CHECK-NEXT: st1 { v1.h }[4], [x9] -; CHECK-NEXT: orr x9, x8, #0xc -; CHECK-NEXT: st1 { v1.h }[5], [x9] -; CHECK-NEXT: orr x9, x8, #0x8 -; CHECK-NEXT: st1 { v0.h }[5], [x10] -; CHECK-NEXT: orr x10, x8, #0x10 -; CHECK-NEXT: st1 { v1.h }[7], [x9] -; CHECK-NEXT: orr x9, x8, #0x4 -; CHECK-NEXT: st1 { v0.h }[3], [x10] -; CHECK-NEXT: mov w10, #26 // =0x1a -; CHECK-NEXT: st1 { v1.h }[1], [x9] -; CHECK-NEXT: orr x9, x8, #0x2 -; CHECK-NEXT: st1 { v1.h }[2], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #20 // =0x14 -; CHECK-NEXT: st1 { v0.h }[6], [x9] -; CHECK-NEXT: orr x9, x8, x10 -; CHECK-NEXT: mov w10, #18 // =0x12 -; CHECK-NEXT: st1 { v0.h }[1], [x9] -; CHECK-NEXT: orr 
x9, x8, x10 -; CHECK-NEXT: st1 { v0.h }[2], [x9] -; CHECK-NEXT: mov w9, #10 // =0xa -; CHECK-NEXT: orr x9, x8, x9 -; CHECK-NEXT: st1 { v1.h }[3], [x8] -; CHECK-NEXT: st1 { v1.h }[6], [x9] +; CHECK-NEXT: mov v1.h[0], v0.h[4] +; CHECK-NEXT: mov v2.h[0], v0.h[5] +; CHECK-NEXT: mov v3.h[0], v0.h[6] +; CHECK-NEXT: mov v4.h[0], v0.h[7] ; CHECK-NEXT: str h0, [sp, #22] -; CHECK-NEXT: str h1, [sp, #6] +; CHECK-NEXT: st1 { v5.h }[3], [x8] +; CHECK-NEXT: str h5, [sp, #6] +; CHECK-NEXT: str h1, [sp, #30] +; CHECK-NEXT: mov v1.h[0], v0.h[1] +; CHECK-NEXT: str h2, [sp, #28] +; CHECK-NEXT: mov v2.h[0], v0.h[2] +; CHECK-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NEXT: str h3, [sp, #26] +; CHECK-NEXT: mov v3.h[0], v5.h[2] +; CHECK-NEXT: str h4, [sp, #24] +; CHECK-NEXT: str h1, [sp, #20] +; CHECK-NEXT: mov v1.h[0], v5.h[4] +; CHECK-NEXT: str h2, [sp, #18] +; CHECK-NEXT: mov v2.h[0], v5.h[5] +; CHECK-NEXT: str h0, [sp, #16] +; CHECK-NEXT: mov v0.h[0], v5.h[6] +; CHECK-NEXT: str h3, [sp, #2] +; CHECK-NEXT: str h1, [sp, #14] +; CHECK-NEXT: mov v1.h[0], v5.h[7] +; CHECK-NEXT: str h2, [sp, #12] +; CHECK-NEXT: mov v2.h[0], v5.h[1] +; CHECK-NEXT: str h0, [sp, #10] +; CHECK-NEXT: str h1, [sp, #8] +; CHECK-NEXT: str h2, [sp, #4] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: mov sp, x29 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 27aa5019fb259..e8c9704940c70 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -73,7 +73,8 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: zip1 z1.s, z1.s, z0.s ; CHECK-NEXT: st1b { z1.h }, p0, [x8] ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] -; CHECK-NEXT: stur b0, [x19, #2] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x19, #2] ; CHECK-NEXT: str h1, [x19] ; CHECK-NEXT: ldp 
x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 0ad9900865518..8f5359bcaa044 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -346,7 +346,6 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: .LBB2_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48 -; CHECK-NEXT: add x13, x0, #8 ; CHECK-NEXT: subs x12, x12, #4 ; CHECK-NEXT: fcmgt v5.4s, v2.4s, v0.4s ; CHECK-NEXT: fcmgt v6.4s, v3.4s, v0.4s @@ -367,8 +366,10 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: xtn v6.4h, v4.4s ; CHECK-NEXT: xtn v7.4h, v2.4s ; CHECK-NEXT: tbl v2.16b, { v5.16b, v6.16b, v7.16b }, v1.16b -; CHECK-NEXT: st1 { v2.s }[2], [x13] -; CHECK-NEXT: str d2, [x0], #12 +; CHECK-NEXT: mov v3.s[0], v2.s[2] +; CHECK-NEXT: str d2, [x0] +; CHECK-NEXT: str s3, [x0, #8] +; CHECK-NEXT: add x0, x0, #12 ; CHECK-NEXT: b.ne .LBB2_4 ; CHECK-NEXT: // %bb.5: // %middle.block ; CHECK-NEXT: cmp x11, x10 diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll index f2389b3e94846..a2ebc416e042f 100644 --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -706,11 +706,10 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q4, q0, [x0, #48] -; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ldr d1, [x0, #80] +; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ldr d1, [x0, #80] ; CHECK-NEXT: ldr q5, [x0, #32] -; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: add x0, x0, #128 ; CHECK-NEXT: uzp1.4s v0, v0, v1 ; CHECK-NEXT: uzp1.4s v1, v5, v4 @@ -720,9 +719,10 @@ define void 
@trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: uzp1.8b v2, v0, v0 ; CHECK-NEXT: uzp1.16b v0, v1, v0 ; CHECK-NEXT: mov.b v1[0], v2[2] +; CHECK-NEXT: mov.h v2[0], v0[4] ; CHECK-NEXT: str d0, [x1] -; CHECK-NEXT: st1.h { v0 }[4], [x9] ; CHECK-NEXT: stur b1, [x1, #10] +; CHECK-NEXT: str h2, [x1, #8] ; CHECK-NEXT: add x1, x1, #16 ; CHECK-NEXT: b.eq LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit @@ -743,23 +743,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-BE-NEXT: ld1 { v2.2d }, [x9] ; CHECK-BE-NEXT: ldr d5, [x0, #80] ; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x9, x1, #8 ; CHECK-BE-NEXT: subs x8, x8, #1 +; CHECK-BE-NEXT: add x0, x0, #128 ; CHECK-BE-NEXT: uzp1 v1.4s, v3.4s, v1.4s ; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v5.4s -; CHECK-BE-NEXT: add x0, x0, #128 ; CHECK-BE-NEXT: uzp1 v2.4s, v4.4s, v2.4s ; CHECK-BE-NEXT: xtn v0.4h, v0.4s ; CHECK-BE-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; CHECK-BE-NEXT: uzp1 v1.16b, v1.16b, v0.16b ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-BE-NEXT: rev16 v2.16b, v1.16b -; CHECK-BE-NEXT: rev64 v1.16b, v1.16b ; CHECK-BE-NEXT: mov v0.b[0], v0.b[2] -; CHECK-BE-NEXT: str d1, [x1] +; CHECK-BE-NEXT: rev64 v1.16b, v1.16b +; CHECK-BE-NEXT: mov v2.h[0], v2.h[4] ; CHECK-BE-NEXT: stur b0, [x1, #10] +; CHECK-BE-NEXT: str d1, [x1] +; CHECK-BE-NEXT: str h2, [x1, #8] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: st1 { v2.h }[4], [x9] ; CHECK-BE-NEXT: b.eq .LBB6_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -779,23 +779,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-DISABLE-NEXT: ld1 { v2.2d }, [x9] ; CHECK-DISABLE-NEXT: ldr d5, [x0, #80] ; CHECK-DISABLE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-DISABLE-NEXT: add x9, x1, #8 ; CHECK-DISABLE-NEXT: subs x8, x8, #1 +; CHECK-DISABLE-NEXT: add x0, x0, #128 ; CHECK-DISABLE-NEXT: uzp1 v1.4s, v3.4s, v1.4s ; CHECK-DISABLE-NEXT: uzp1 v0.4s, v0.4s, v5.4s -; CHECK-DISABLE-NEXT: add x0, x0, #128 ; CHECK-DISABLE-NEXT: uzp1 
v2.4s, v4.4s, v2.4s ; CHECK-DISABLE-NEXT: xtn v0.4h, v0.4s ; CHECK-DISABLE-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; CHECK-DISABLE-NEXT: uzp1 v1.16b, v1.16b, v0.16b ; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b -; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b ; CHECK-DISABLE-NEXT: mov v0.b[0], v0.b[2] -; CHECK-DISABLE-NEXT: str d1, [x1] +; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b +; CHECK-DISABLE-NEXT: mov v2.h[0], v2.h[4] ; CHECK-DISABLE-NEXT: stur b0, [x1, #10] +; CHECK-DISABLE-NEXT: str d1, [x1] +; CHECK-DISABLE-NEXT: str h2, [x1, #8] ; CHECK-DISABLE-NEXT: add x1, x1, #16 -; CHECK-DISABLE-NEXT: st1 { v2.h }[4], [x9] ; CHECK-DISABLE-NEXT: b.eq .LBB6_1 ; CHECK-DISABLE-NEXT: // %bb.2: // %exit ; CHECK-DISABLE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 19178964710cd..4e06e062e0c7d 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -254,7 +254,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 443bd46bb71da..fc6c756f1b17e 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -251,7 +251,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v0.s[1], w10 ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git 
a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 66b37d1913505..1529a17b2a70d 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -443,11 +443,11 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: orr w8, w9, w8, lsl #16 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: mov.h v1[0], v0[2] ; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret ; ; BE-LABEL: load_ext_to_64bits: @@ -461,11 +461,11 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 ; BE-NEXT: ld1 { v0.b }[4], [x8] -; BE-NEXT: add x8, x1, #4 ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: rev32 v1.8h, v0.8h -; BE-NEXT: st1 { v0.h }[2], [x8] -; BE-NEXT: str s1, [x1] +; BE-NEXT: mov v1.h[0], v0.h[2] +; BE-NEXT: rev32 v0.8h, v0.8h +; BE-NEXT: str h1, [x1, #4] +; BE-NEXT: str s0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -479,23 +479,23 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits_default_align: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: mov.h v1[0], v0[2] ; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret ; ; BE-LABEL: load_ext_to_64bits_default_align: ; BE: // %bb.0: // %entry ; BE-NEXT: ldr s0, [x0] -; BE-NEXT: add x8, x1, #4 ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: rev32 v1.8h, v0.8h -; BE-NEXT: st1 { v0.h }[2], [x8] -; BE-NEXT: str s1, [x1] +; BE-NEXT: 
mov v1.h[0], v0.h[2] +; BE-NEXT: rev32 v0.8h, v0.8h +; BE-NEXT: str h1, [x1, #4] +; BE-NEXT: str s0, [x1] ; BE-NEXT: ret entry: %l = load <3 x i8>, ptr %src @@ -508,23 +508,23 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits_align_4: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: mov.h v1[0], v0[2] ; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret ; ; BE-LABEL: load_ext_to_64bits_align_4: ; BE: // %bb.0: // %entry ; BE-NEXT: ldr s0, [x0] -; BE-NEXT: add x8, x1, #4 ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: rev32 v1.8h, v0.8h -; BE-NEXT: st1 { v0.h }[2], [x8] -; BE-NEXT: str s1, [x1] +; BE-NEXT: mov v1.h[0], v0.h[2] +; BE-NEXT: rev32 v0.8h, v0.8h +; BE-NEXT: str h1, [x1, #4] +; BE-NEXT: str s0, [x1] ; BE-NEXT: ret entry: %l = load <3 x i8>, ptr %src, align 4 @@ -542,14 +542,14 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: adrp x8, lCPI15_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI15_0@PAGEOFF] -; CHECK-NEXT: add x8, x1, #4 ; CHECK-NEXT: orr w9, w10, w9, lsl #16 ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: mov.h v1[0], v0[2] ; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 ; @@ -567,12 +567,12 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: adrp x8, .LCPI15_0 ; BE-NEXT: add x8, x8, :lo12:.LCPI15_0 ; BE-NEXT: ld1 { v1.4h }, [x8] -; BE-NEXT: add x8, x1, #4 ; BE-NEXT: bic v0.4h, #255, lsl #8 ; BE-NEXT: add v0.4h, v0.4h, v1.4h -; BE-NEXT: rev32 v1.8h, v0.8h -; BE-NEXT: st1 { v0.h }[2], [x8] -; BE-NEXT: 
str s1, [x1] +; BE-NEXT: mov v1.h[0], v0.h[2] +; BE-NEXT: rev32 v0.8h, v0.8h +; BE-NEXT: str h1, [x1, #4] +; BE-NEXT: str s0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 09662aef7e423..f9654fdb41bbc 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -50,10 +50,10 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; CHECK-LABEL: uaddo_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: mov v2.s[0], v1.s[2] ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: st1 { v1.s }[2], [x8] ; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: str s2, [x0, #8] ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 @@ -212,26 +212,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: bic v0.4s, #255, lsl #24 -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: bic v1.4s, #1, lsl #24 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: mov w10, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: str h1, [x0] +; CHECK-NEXT: bic v0.4s, #1, lsl #24 ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: strh w9, [x0, #6] -; CHECK-NEXT: lsr w9, w9, #16 -; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sturh w10, [x0, #3] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: strb w8, [x0, #11] ; CHECK-NEXT: lsr w8, w10, #16 -; CHECK-NEXT: strb w9, [x0, #8] -; CHECK-NEXT: lsr w9, w11, #16 -; 
CHECK-NEXT: sturh w10, [x0, #3] -; CHECK-NEXT: mvn v0.16b, v1.16b -; CHECK-NEXT: strh w11, [x0] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w9, [x0, #6] +; CHECK-NEXT: lsr w9, w9, #16 ; CHECK-NEXT: strb w8, [x0, #5] +; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: strb w9, [x0, #8] +; CHECK-NEXT: lsr w9, w10, #16 ; CHECK-NEXT: strb w9, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index 7803c095b77c2..4f4f1ed8f8ed3 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -55,13 +55,12 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s -; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s -; CHECK-NEXT: st1 { v1.s }[2], [x8] ; CHECK-NEXT: str d1, [x0] -; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: cmtst v0.4s, v2.4s, v2.4s +; CHECK-NEXT: mov v2.s[0], v1.s[2] +; CHECK-NEXT: str s2, [x0, #8] ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 @@ -260,27 +259,27 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { ; CHECK-NEXT: bic v0.4s, #255, lsl #24 ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v1.4s, v3.4s, v2.4s -; CHECK-NEXT: ushr v2.4s, v0.4s, #24 -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[1] -; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, v1.s[3] +; CHECK-NEXT: uzp2 v0.4s, v3.4s, v2.4s +; CHECK-NEXT: ushr v2.4s, 
v1.4s, #24 +; CHECK-NEXT: mov w10, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: str h1, [x0] ; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, #0 ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: strh w9, [x0, #6] -; CHECK-NEXT: lsr w9, w9, #16 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: sturh w10, [x0, #3] ; CHECK-NEXT: strb w8, [x0, #11] ; CHECK-NEXT: lsr w8, w10, #16 -; CHECK-NEXT: orn v0.16b, v2.16b, v1.16b -; CHECK-NEXT: strb w9, [x0, #8] -; CHECK-NEXT: lsr w9, w11, #16 -; CHECK-NEXT: sturh w10, [x0, #3] -; CHECK-NEXT: strh w11, [x0] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: strh w9, [x0, #6] +; CHECK-NEXT: lsr w9, w9, #16 +; CHECK-NEXT: orn v0.16b, v2.16b, v0.16b ; CHECK-NEXT: strb w8, [x0, #5] +; CHECK-NEXT: strb w9, [x0, #8] +; CHECK-NEXT: lsr w9, w10, #16 ; CHECK-NEXT: strb w9, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index f990bdc2e5615..672b6af11cfc3 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -209,44 +209,44 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: umov.b w9, v2[0] ; CHECK-NEXT: umov.b w10, v2[1] -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: umov.b w11, v2[2] +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: umov.b w12, v2[2] ; CHECK-NEXT: umov.b w13, v2[3] ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: umov.b w14, v2[4] +; CHECK-NEXT: umov.b w15, v2[4] ; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: mov.s v3[0], v0[3] ; CHECK-NEXT: and x10, x10, #0x1 -; CHECK-NEXT: and x15, x9, #0x1 -; CHECK-NEXT: bfi x12, x9, #2, #1 -; CHECK-NEXT: and x9, x11, #0x1 -; CHECK-NEXT: add x10, x15, x10 -; CHECK-NEXT: umov.b w11, v2[5] -; CHECK-NEXT: add x9, x10, x9 -; CHECK-NEXT: orr 
x15, x8, x10, lsl #2 -; CHECK-NEXT: umov.b w10, v2[6] -; CHECK-NEXT: st1.s { v0 }[1], [x12] -; CHECK-NEXT: add x12, x8, x9, lsl #2 -; CHECK-NEXT: and x13, x13, #0x1 -; CHECK-NEXT: st1.s { v0 }[2], [x15] -; CHECK-NEXT: add x9, x9, x13 -; CHECK-NEXT: st1.s { v0 }[3], [x12] -; CHECK-NEXT: and x12, x14, #0x1 -; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: and x14, x9, #0x1 +; CHECK-NEXT: bfi x11, x9, #2, #1 +; CHECK-NEXT: add x9, x14, x10 +; CHECK-NEXT: umov.b w10, v2[5] +; CHECK-NEXT: st1.s { v0 }[1], [x11] +; CHECK-NEXT: and x11, x12, #0x1 +; CHECK-NEXT: orr x14, x8, x9, lsl #2 +; CHECK-NEXT: and x12, x13, #0x1 +; CHECK-NEXT: add x9, x9, x11 +; CHECK-NEXT: umov.b w11, v2[6] +; CHECK-NEXT: and x13, x15, #0x1 ; CHECK-NEXT: add x12, x9, x12 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: and x9, x9, #0x7 -; CHECK-NEXT: add x11, x12, x11 +; CHECK-NEXT: st1.s { v0 }[2], [x14] +; CHECK-NEXT: str s3, [x8, x9, lsl #2] +; CHECK-NEXT: and x9, x10, #0x1 +; CHECK-NEXT: add x10, x12, x13 ; CHECK-NEXT: and x12, x12, #0x7 -; CHECK-NEXT: str s1, [x8, x9, lsl #2] -; CHECK-NEXT: add w10, w11, w10 -; CHECK-NEXT: and x11, x11, #0x7 -; CHECK-NEXT: add x12, x8, x12, lsl #2 +; CHECK-NEXT: add x9, x10, x9 ; CHECK-NEXT: and x10, x10, #0x7 -; CHECK-NEXT: add x9, x8, x11, lsl #2 -; CHECK-NEXT: add x8, x8, x10, lsl #2 -; CHECK-NEXT: st1.s { v1 }[1], [x12] -; CHECK-NEXT: st1.s { v1 }[2], [x9] -; CHECK-NEXT: st1.s { v1 }[3], [x8] +; CHECK-NEXT: str s1, [x8, x12, lsl #2] +; CHECK-NEXT: and x12, x9, #0x7 +; CHECK-NEXT: mov.s v0[0], v1[3] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: add x10, x8, x10, lsl #2 +; CHECK-NEXT: add x12, x8, x12, lsl #2 +; CHECK-NEXT: add w9, w9, w11 +; CHECK-NEXT: and x9, x9, #0x7 +; CHECK-NEXT: st1.s { v1 }[1], [x10] +; CHECK-NEXT: st1.s { v1 }[2], [x12] +; CHECK-NEXT: str s0, [x8, x9, lsl #2] ; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef) diff --git 
a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index e3c4fe44d201d..70e468a2b7586 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1570,36 +1570,36 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) { ; CHECK-NEXT: LBB16_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr d0, [x0, x8] -; CHECK-NEXT: add x9, x1, #112 ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: str xzr, [x1, #120] +; CHECK-NEXT: str xzr, [x1, #104] ; CHECK-NEXT: cmp x8, #128 ; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: str xzr, [x1, #104] ; CHECK-NEXT: str xzr, [x1, #88] ; CHECK-NEXT: str xzr, [x1, #72] +; CHECK-NEXT: str xzr, [x1, #56] ; CHECK-NEXT: ushll2.4s v1, v0, #0 ; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: str xzr, [x1, #56] ; CHECK-NEXT: str xzr, [x1, #40] ; CHECK-NEXT: str xzr, [x1, #24] +; CHECK-NEXT: str xzr, [x1, #8] ; CHECK-NEXT: ushll2.2d v2, v1, #0 ; CHECK-NEXT: ushll.2d v1, v1, #0 ; CHECK-NEXT: ushll2.2d v3, v0, #0 ; CHECK-NEXT: ushll.2d v0, v0, #0 -; CHECK-NEXT: str xzr, [x1, #8] -; CHECK-NEXT: st1.d { v2 }[1], [x9] -; CHECK-NEXT: add x9, x1, #80 -; CHECK-NEXT: st1.d { v1 }[1], [x9] -; CHECK-NEXT: add x9, x1, #48 ; CHECK-NEXT: str d2, [x1, #96] -; CHECK-NEXT: st1.d { v3 }[1], [x9] -; CHECK-NEXT: add x9, x1, #16 +; CHECK-NEXT: mov.d v2[0], v2[1] ; CHECK-NEXT: str d1, [x1, #64] +; CHECK-NEXT: mov.d v1[0], v1[1] ; CHECK-NEXT: str d3, [x1, #32] +; CHECK-NEXT: mov.d v3[0], v3[1] ; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: mov.d v0[0], v0[1] +; CHECK-NEXT: str d2, [x1, #112] +; CHECK-NEXT: str d1, [x1, #80] +; CHECK-NEXT: str d3, [x1, #48] +; CHECK-NEXT: str d0, [x1, #16] ; CHECK-NEXT: add x1, x1, #256 -; CHECK-NEXT: st1.d { v0 }[1], [x9] ; CHECK-NEXT: b.ne LBB16_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1612,10 +1612,9 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: add x9, x0, x8 ; CHECK-BE-NEXT: add 
x8, x8, #16 ; CHECK-BE-NEXT: ld1 { v0.8b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #120 ; CHECK-BE-NEXT: str xzr, [x1, #112] -; CHECK-BE-NEXT: str xzr, [x1, #96] ; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: str xzr, [x1, #96] ; CHECK-BE-NEXT: str xzr, [x1, #80] ; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-BE-NEXT: str xzr, [x1, #64] @@ -1629,18 +1628,19 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-BE-NEXT: st1 { v2.d }[1], [x9] -; CHECK-BE-NEXT: add x9, x1, #88 -; CHECK-BE-NEXT: st1 { v1.d }[1], [x9] -; CHECK-BE-NEXT: add x9, x1, #56 ; CHECK-BE-NEXT: str d2, [x1, #104] -; CHECK-BE-NEXT: st1 { v3.d }[1], [x9] -; CHECK-BE-NEXT: add x9, x1, #24 +; CHECK-BE-NEXT: mov v2.d[0], v2.d[1] ; CHECK-BE-NEXT: str d1, [x1, #72] +; CHECK-BE-NEXT: mov v1.d[0], v1.d[1] ; CHECK-BE-NEXT: str d3, [x1, #40] +; CHECK-BE-NEXT: mov v3.d[0], v3.d[1] ; CHECK-BE-NEXT: str d0, [x1, #8] +; CHECK-BE-NEXT: mov v0.d[0], v0.d[1] +; CHECK-BE-NEXT: str d2, [x1, #120] +; CHECK-BE-NEXT: str d1, [x1, #88] +; CHECK-BE-NEXT: str d3, [x1, #56] +; CHECK-BE-NEXT: str d0, [x1, #24] ; CHECK-BE-NEXT: add x1, x1, #256 -; CHECK-BE-NEXT: st1 { v0.d }[1], [x9] ; CHECK-BE-NEXT: b.ne .LBB16_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -2031,58 +2031,7 @@ exit: ret void } -; CHECK-LABEL: lCPI20_0: -; CHECK-NEXT: .byte 0 ; 0x0 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 1 ; 0x1 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 2 ; 0x2 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 3 ; 0x3 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 4 ; 0x4 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 5 ; 0x5 -; CHECK-NEXT:lCPI20_1: -; CHECK-NEXT: .byte 255 ; 0xff -; 
CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 6 ; 0x6 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 7 ; 0x7 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 8 ; 0x8 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 9 ; 0x9 -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 10 ; 0xa -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT:lCPI20_2: -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 11 ; 0xb -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 12 ; 0xc -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 13 ; 0xd -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 14 ; 0xe -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 15 ; 0xf -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT: .byte 255 ; 0xff -; CHECK-NEXT:lCPI20_3: +; CHECK-LABEL: lCPI20_0: ; CHECK-NEXT: .byte 0 ; 0x0 ; CHECK-NEXT: .byte 255 ; 0xff ; CHECK-NEXT: .byte 255 ; 0xff @@ -2099,6 +2048,57 @@ exit: ; CHECK-NEXT: .byte 255 ; 0xff ; CHECK-NEXT: .byte 255 ; 0xff ; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI20_1: +; CHECK-NEXT: .byte 0 ; 0x0 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 1 ; 0x1 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 2 ; 0x2 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 3 ; 0x3 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 4 ; 0x4 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 5 ; 0x5 +; CHECK-NEXT: lCPI20_2: +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 6 ; 0x6 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 7 ; 0x7 +; CHECK-NEXT: 
.byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 8 ; 0x8 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 9 ; 0x9 +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 10 ; 0xa +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: lCPI20_3: +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 11 ; 0xb +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 12 ; 0xc +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 13 ; 0xd +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 14 ; 0xe +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 15 ; 0xf +; CHECK-NEXT: .byte 255 ; 0xff +; CHECK-NEXT: .byte 255 ; 0xff ; CHECK-BE-LABEL: .LCPI20_0: ; CHECK-BE-NEXT: .byte 255 // 0xff @@ -2193,18 +2193,18 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x0, x8 ; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: ldp q5, q4, [x9] -; CHECK-NEXT: add x9, x1, #56 +; CHECK-NEXT: ldp q4, q5, [x9] ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 -; CHECK-NEXT: tbl.16b v6, { v5 }, v2 -; CHECK-NEXT: tbl.16b v7, { v5 }, v1 ; CHECK-NEXT: tbl.16b v5, { v5 }, v0 +; CHECK-NEXT: tbl.16b v6, { v4 }, v3 +; CHECK-NEXT: tbl.16b v7, { v4 }, v2 +; CHECK-NEXT: tbl.16b v4, { v4 }, v1 ; CHECK-NEXT: stp q7, q6, [x1, #16] -; CHECK-NEXT: str q5, [x1] -; CHECK-NEXT: str d4, [x1, #48] +; CHECK-NEXT: mov.s v6[0], v5[2] +; CHECK-NEXT: str q4, [x1] +; CHECK-NEXT: str d5, [x1, #48] +; CHECK-NEXT: str s6, [x1, #56] ; CHECK-NEXT: add x1, x1, #64 -; CHECK-NEXT: st1.s { v4 }[2], [x9] ; CHECK-NEXT: b.ne LBB20_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -2239,19 +2239,19 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ld1 { v4.16b }, [x10] ; CHECK-BE-NEXT: cmp x8, #128 ; 
CHECK-BE-NEXT: tbl v6.16b, { v5.16b }, v3.16b -; CHECK-BE-NEXT: tbl v7.16b, { v5.16b }, v2.16b +; CHECK-BE-NEXT: tbl v16.16b, { v5.16b }, v2.16b ; CHECK-BE-NEXT: tbl v5.16b, { v5.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: st1 { v6.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: rev32 v16.16b, v4.16b +; CHECK-BE-NEXT: rev32 v7.16b, v4.16b ; CHECK-BE-NEXT: rev64 v4.16b, v4.16b -; CHECK-BE-NEXT: st1 { v7.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #56 ; CHECK-BE-NEXT: st1 { v5.16b }, [x1] +; CHECK-BE-NEXT: st1 { v16.16b }, [x9] +; CHECK-BE-NEXT: mov v6.s[0], v7.s[2] ; CHECK-BE-NEXT: str d4, [x1, #48] +; CHECK-BE-NEXT: str s6, [x1, #56] ; CHECK-BE-NEXT: add x1, x1, #64 -; CHECK-BE-NEXT: st1 { v16.s }[2], [x9] ; CHECK-BE-NEXT: b.ne .LBB20_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -2592,36 +2592,36 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ld1 { v7.16b }, [x9] ; CHECK-BE-NEXT: add x9, x9, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: ld1 { v17.16b }, [x9] +; CHECK-BE-NEXT: ld1 { v16.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #80 -; CHECK-BE-NEXT: tbl v16.16b, { v7.16b }, v6.16b +; CHECK-BE-NEXT: tbl v17.16b, { v7.16b }, v6.16b ; CHECK-BE-NEXT: tbl v18.16b, { v7.16b }, v5.16b -; CHECK-BE-NEXT: tbl v19.16b, { v7.16b }, v4.16b -; CHECK-BE-NEXT: tbl v20.16b, { v7.16b }, v3.16b -; CHECK-BE-NEXT: tbl v21.16b, { v17.16b }, v0.16b -; CHECK-BE-NEXT: st1 { v16.16b }, [x9] +; CHECK-BE-NEXT: tbl v20.16b, { v7.16b }, v4.16b +; CHECK-BE-NEXT: tbl v19.16b, { v16.16b }, v0.16b +; CHECK-BE-NEXT: tbl v21.16b, { v7.16b }, v3.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #64 -; CHECK-BE-NEXT: tbl v16.16b, { v7.16b }, v2.16b +; CHECK-BE-NEXT: tbl v17.16b, { v7.16b }, v2.16b ; CHECK-BE-NEXT: st1 { v18.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: tbl v18.16b, { v17.16b }, v2.16b -; CHECK-BE-NEXT: st1 { v19.16b }, [x9] -; CHECK-BE-NEXT: 
add x9, x1, #32 -; CHECK-BE-NEXT: tbl v17.16b, { v17.16b }, v1.16b +; CHECK-BE-NEXT: rev16 v18.16b, v19.16b ; CHECK-BE-NEXT: st1 { v20.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: tbl v20.16b, { v16.16b }, v2.16b +; CHECK-BE-NEXT: st1 { v21.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #16 -; CHECK-BE-NEXT: rev64 v19.16b, v21.16b -; CHECK-BE-NEXT: st1 { v16.16b }, [x9] -; CHECK-BE-NEXT: rev16 v16.16b, v21.16b +; CHECK-BE-NEXT: tbl v16.16b, { v16.16b }, v1.16b +; CHECK-BE-NEXT: st1 { v17.16b }, [x9] +; CHECK-BE-NEXT: rev64 v17.16b, v19.16b ; CHECK-BE-NEXT: add x9, x1, #112 -; CHECK-BE-NEXT: st1 { v18.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #96 ; CHECK-BE-NEXT: tbl v7.16b, { v7.16b }, v1.16b -; CHECK-BE-NEXT: st1 { v17.16b }, [x9] -; CHECK-BE-NEXT: add x9, x1, #136 -; CHECK-BE-NEXT: st1 { v16.h }[4], [x9] -; CHECK-BE-NEXT: fmov x9, d19 +; CHECK-BE-NEXT: mov v18.h[0], v18.h[4] +; CHECK-BE-NEXT: st1 { v20.16b }, [x9] +; CHECK-BE-NEXT: add x9, x1, #96 +; CHECK-BE-NEXT: st1 { v16.16b }, [x9] +; CHECK-BE-NEXT: fmov x9, d17 ; CHECK-BE-NEXT: st1 { v7.16b }, [x1] +; CHECK-BE-NEXT: str h18, [x1, #136] ; CHECK-BE-NEXT: str x9, [x1, #128]! 
; CHECK-BE-NEXT: b.ne .LBB21_1 ; CHECK-BE-NEXT: // %bb.2: // %exit @@ -2702,29 +2702,28 @@ define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-BE-NEXT: mov x10, v1.d[1] ; CHECK-BE-NEXT: mov x9, v2.d[1] +; CHECK-BE-NEXT: mov x10, v1.d[1] ; CHECK-BE-NEXT: fmov x13, d1 ; CHECK-BE-NEXT: mov x11, v3.d[1] ; CHECK-BE-NEXT: mov x12, v0.d[1] -; CHECK-BE-NEXT: mov v1.d[0], v1.d[1] -; CHECK-BE-NEXT: orr x10, x10, x13, lsl #33 -; CHECK-BE-NEXT: fmov x13, d2 +; CHECK-BE-NEXT: fmov x14, d2 +; CHECK-BE-NEXT: fmov x15, d3 ; CHECK-BE-NEXT: lsl x9, x9, #2 +; CHECK-BE-NEXT: orr x13, x10, x13, lsl #33 +; CHECK-BE-NEXT: strb w10, [x1, #32] ; CHECK-BE-NEXT: lsl x11, x11, #4 ; CHECK-BE-NEXT: lsl x12, x12, #6 -; CHECK-BE-NEXT: stur b1, [x1, #32] -; CHECK-BE-NEXT: orr x13, x9, x13, lsl #35 -; CHECK-BE-NEXT: extr x9, x9, x10, #8 -; CHECK-BE-NEXT: fmov x10, d3 -; CHECK-BE-NEXT: orr x10, x11, x10, lsl #37 -; CHECK-BE-NEXT: extr x11, x11, x13, #8 +; CHECK-BE-NEXT: orr x14, x9, x14, lsl #35 +; CHECK-BE-NEXT: extr x9, x9, x13, #8 ; CHECK-BE-NEXT: fmov x13, d0 -; CHECK-BE-NEXT: stp x11, x9, [x1, #16] -; CHECK-BE-NEXT: extr x9, x12, x10, #8 -; CHECK-BE-NEXT: orr x13, x12, x13, lsl #39 -; CHECK-BE-NEXT: lsr x10, x13, #8 -; CHECK-BE-NEXT: stp x10, x9, [x1], #128 +; CHECK-BE-NEXT: orr x15, x11, x15, lsl #37 +; CHECK-BE-NEXT: extr x10, x11, x14, #8 +; CHECK-BE-NEXT: orr x11, x12, x13, lsl #39 +; CHECK-BE-NEXT: extr x12, x12, x15, #8 +; CHECK-BE-NEXT: stp x10, x9, [x1, #16] +; CHECK-BE-NEXT: lsr x9, x11, #8 +; CHECK-BE-NEXT: stp x9, x12, [x1], #128 ; CHECK-BE-NEXT: b.ne .LBB22_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret From 9ce811e3c177942be4a5e347a68261b5d744c5ec Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 11 Apr 2025 12:29:08 +0000 Subject: [PATCH 08/12] Fixups --- llvm/lib/CodeGen/ValueTypes.cpp | 2 +- 
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 ++++++------ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index b6d287295e1dd..10970b719fcae 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -224,7 +224,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::aarch64svcount: return TargetExtType::get(Context, "aarch64.svcount"); case MVT::aarch64mfp8: - return VectorType::get(IntegerType::get(Context, 8), ElementCount::getFixed(1)); + return FixedVectorType::get(IntegerType::get(Context, 8), 1); case MVT::x86amx: return Type::getX86_AMXTy(Context); case MVT::i64x8: return IntegerType::get(Context, 512); case MVT::amdgpuBufferFatPointer: return IntegerType::get(Context, 160); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0b8bd93f50471..a9b32e2bb769f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24036,13 +24036,12 @@ static SDValue performSTORECombine(SDNode *N, if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse()) return SDValue(); + // These can lower to st1, which is preferable if we're unlikely to fold the + // addressing into the store. if (Subtarget->isNeonAvailable() && ElemVT == MemVT && (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst && - !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) { - // These can lower to st1, which is preferable if we're unlikely to fold - // the addressing into the store. 
+ !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) return SDValue(); - } if (MemVT == MVT::i64 || MemVT == MVT::i32) { // Heuristic: If there are other users of w/x integer scalars extracted @@ -24066,10 +24065,11 @@ static SDValue performSTORECombine(SDNode *N, Value.getValueType(), Vector, ExtIdx); // FIXME: Using a fixed-size vector for the insertion should not be // necessary, but SVE ISEL is missing some folds to avoid fmovs. - SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue Zero = DAG.getVectorIdxConstant(0, DL); EVT InsertVectorVT = EVT::getVectorVT( *DAG.getContext(), ElemVT, - VectorVT.getVectorElementCount().getKnownMinValue(), false); + ElementCount::getFixed( + VectorVT.getVectorElementCount().getKnownMinValue())); ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT, DAG.getUNDEF(InsertVectorVT), Ext, Zero); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 7ac36e42f56ae..aaa487cc5b0d1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4604,8 +4604,8 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), // aarch64mfp8 (bsub) stores def : Pat<(store aarch64mfp8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)), - (STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s1:$offset)>; // Match stores from lane 0 to the appropriate subreg's store. 
multiclass VecStoreULane0Pat Date: Fri, 11 Apr 2025 13:56:56 +0000 Subject: [PATCH 09/12] Prefer scalar mov form --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 30 ++++-- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 24 ++--- llvm/test/CodeGen/AArch64/add.ll | 8 +- llvm/test/CodeGen/AArch64/andorxor.ll | 24 ++--- llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 6 +- llvm/test/CodeGen/AArch64/arm64-rev.ll | 2 +- llvm/test/CodeGen/AArch64/arm64-st1.ll | 56 +++++------ llvm/test/CodeGen/AArch64/bitcast-v2i8.ll | 2 +- llvm/test/CodeGen/AArch64/bitcast.ll | 4 +- llvm/test/CodeGen/AArch64/concat-vector.ll | 4 +- llvm/test/CodeGen/AArch64/ctlz.ll | 8 +- llvm/test/CodeGen/AArch64/ctpop.ll | 8 +- llvm/test/CodeGen/AArch64/cttz.ll | 8 +- llvm/test/CodeGen/AArch64/insertextract.ll | 4 +- llvm/test/CodeGen/AArch64/mul.ll | 8 +- llvm/test/CodeGen/AArch64/neon-rshrn.ll | 2 +- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 6 +- llvm/test/CodeGen/AArch64/pr-cf624b2.ll | 14 +-- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/shufflevector.ll | 4 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/store.ll | 36 +++---- llvm/test/CodeGen/AArch64/sub.ll | 8 +- .../AArch64/sve-fixed-length-permute-rev.ll | 26 ++--- llvm/test/CodeGen/AArch64/tbl-loops.ll | 2 +- llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 16 ++-- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 4 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 4 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 94 +++++++++---------- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 2 +- llvm/test/CodeGen/AArch64/vec_umulo.ll | 2 +- llvm/test/CodeGen/AArch64/vector-compress.ll | 4 +- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 22 ++--- 33 files changed, 227 insertions(+), 223 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index aaa487cc5b0d1..85aa57ae85671 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7248,8 +7248,15 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane // Move elements between vectors multiclass Neon_INS_elt_pattern { + ValueType VTScal, Operand SVEIdxTy, Instruction INS, Instruction DUP, SubRegIndex DUPSub> { // Extracting from the lowest 128-bits of an SVE vector + def : Pat<(VT128 (vector_insert undef, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 0))), + (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), + (DUP (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn), + DUPSub)>; + def : Pat<(VT128 (vector_insert VT128:$Rn, (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), (i64 imm:$Immd))), @@ -7268,6 +7275,11 @@ multiclass Neon_INS_elt_pattern; + def : Pat<(VT128 (vector_insert undef, + (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), + (i64 0))), + (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), (DUP V128:$Rn, imm:$Immn), DUPSub)>; + def : Pat<(VT128 (vector_insert V128:$src, (VTScal (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), @@ -7290,15 +7302,15 @@ multiclass Neon_INS_elt_pattern; } -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; // Insert from bitcast // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index f5e4d8127800d..6c8fd3c5c6029 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -8,7 
+8,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_str_lane_s32(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s32: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: str s0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -39,7 +39,7 @@ entry: define void @test_str_lane_s64(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s64: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[1] +; CHECK-NONSTREAMING-NEXT: mov d0, v0.d[1] ; CHECK-NONSTREAMING-NEXT: str d0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -120,7 +120,7 @@ entry: define void @test_str_lane_s8(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7] +; CHECK-NONSTREAMING-NEXT: mov b0, v0.b[7] ; CHECK-NONSTREAMING-NEXT: str b0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -149,7 +149,7 @@ entry: define void @test_str_lane_s16(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s16: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NONSTREAMING-NEXT: mov h0, v0.h[3] ; CHECK-NONSTREAMING-NEXT: str h0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -286,7 +286,7 @@ define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: stur s0, [x0, #-32] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -319,7 +319,7 @@ entry: define void @test_str_lane_s64_negative_offset(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[1] +; CHECK-NONSTREAMING-NEXT: mov d0, v0.d[1] ; 
CHECK-NONSTREAMING-NEXT: stur d0, [x0, #-64] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -352,7 +352,7 @@ entry: define void @test_str_lane_s8_negative_offset(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7] +; CHECK-NONSTREAMING-NEXT: mov b0, v0.b[7] ; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -383,7 +383,7 @@ entry: define void @test_str_lane_s16_negative_offset(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NONSTREAMING-NEXT: mov h0, v0.h[3] ; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -416,7 +416,7 @@ entry: define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: str h0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -450,7 +450,7 @@ entry: define void @test_str_trunc_lane_s32_to_s8(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: str b0, [x0] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -496,7 +496,7 @@ entry: define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { ; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] ; CHECK-NONSTREAMING-NEXT: ret ; @@ -531,7 +531,7 @@ entry: define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, 
%b) { ; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: ; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3] +; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] ; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] ; CHECK-NONSTREAMING-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 44e79fb5e1e37..d5bd1b712a2a6 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -63,7 +63,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -100,7 +100,7 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -231,7 +231,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -263,7 +263,7 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 30a7e4aca1c47..f7df1092287bd 100644 --- 
a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -183,7 +183,7 @@ define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -219,7 +219,7 @@ define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -255,7 +255,7 @@ define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -292,7 +292,7 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -340,7 +340,7 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -388,7 +388,7 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: 
uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -693,7 +693,7 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -729,7 +729,7 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -765,7 +765,7 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -799,7 +799,7 @@ define void @and_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; @@ -836,7 +836,7 @@ define void @or_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; @@ -873,7 +873,7 @@ define void @xor_v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor x8, x8, x9 ; CHECK-SD-NEXT: fmov d0, x8 ; CHECK-SD-NEXT: str w8, [x0] -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; 
CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str h0, [x0, #4] ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index c0d91c1e0c836..2a085dc0e72bf 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -2062,7 +2062,7 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] +; CHECK-GI-NEXT: mov h2, v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] ; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] @@ -2189,7 +2189,7 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; CHECK-GI-NEXT: mov s2, v0.s[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] ; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] @@ -2252,7 +2252,7 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v0.d[0], v0.d[0] +; CHECK-GI-NEXT: mov d0, v0.d[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll index 2006e5af547c1..6bdd5f998a3b9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -462,7 +462,7 @@ define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp ; CHECK-SD-LABEL: test_vrev64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr q0, [x0] -; CHECK-SD-NEXT: mov.h v1[0], v0[5] +; CHECK-SD-NEXT: mov h1, v0[5] ; CHECK-SD-NEXT: st1.h { v0 
}[6], [x1] ; CHECK-SD-NEXT: str h1, [x1, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll index a4bf5c6e4d5b3..d6abf829bc989 100644 --- a/llvm/test/CodeGen/AArch64/arm64-st1.ll +++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll @@ -6,7 +6,7 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_16b: -; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: mov b0, v0[1] ; SD-CHECK: stur b0, [x0, #1] ; ; GI-CHECK-LABEL: st1lane_16b: @@ -14,7 +14,7 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) { ; GI-CHECK: st1.b { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_16b: -; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: mov b0, v0[1] ; EXYNOS: stur b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <16 x i8> %A, i32 1 @@ -57,7 +57,7 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) { define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_16b: -; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: mov b0, v0[1] ; SD-CHECK: str b0, [x0, x1] ; ; GI-CHECK-LABEL: st1lane_ro_16b: @@ -65,7 +65,7 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.b { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_16b: -; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: mov b0, v0[1] ; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <16 x i8> %A, i32 1 @@ -91,7 +91,7 @@ define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) { define void @st1lane_8h(<8 x i16> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_8h: -; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: mov h0, v0[1] ; SD-CHECK: str h0, [x0, #2] ; ; GI-CHECK-LABEL: st1lane_8h: @@ -99,7 +99,7 @@ define void @st1lane_8h(<8 x i16> %A, ptr %D) { ; GI-CHECK: st1.h { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_8h: -; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: mov h0, v0[1] ; EXYNOS: str h0, [x0, #2] %ptr = getelementptr i16, ptr %D, i64 1 %tmp = extractelement <8 x i16> %A, i32 1 @@ 
-127,7 +127,7 @@ define void @st1lane0u_8h(<8 x i16> %A, ptr %D) { define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_8h: -; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: mov h0, v0[1] ; SD-CHECK: str h0, [x0, x1, lsl #1] ; ; GI-CHECK-LABEL: st1lane_ro_8h: @@ -135,7 +135,7 @@ define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.h { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_8h: -; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: mov h0, v0[1] ; EXYNOS: str h0, [x0, x1, lsl #1] %ptr = getelementptr i16, ptr %D, i64 %offset %tmp = extractelement <8 x i16> %A, i32 1 @@ -154,7 +154,7 @@ define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) { define void @st1lane_4s(<4 x i32> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_4s: -; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: mov s0, v0[1] ; SD-CHECK: str s0, [x0, #4] ; ; GI-CHECK-LABEL: st1lane_4s: @@ -162,7 +162,7 @@ define void @st1lane_4s(<4 x i32> %A, ptr %D) { ; GI-CHECK: st1.s { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_4s: -; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: mov s0, v0[1] ; EXYNOS: str s0, [x0, #4] %ptr = getelementptr i32, ptr %D, i64 1 %tmp = extractelement <4 x i32> %A, i32 1 @@ -190,7 +190,7 @@ define void @st1lane0u_4s(<4 x i32> %A, ptr %D) { define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_4s: -; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: mov s0, v0[1] ; SD-CHECK: str s0, [x0, x1, lsl #2] ; ; GI-CHECK-LABEL: st1lane_ro_4s: @@ -198,7 +198,7 @@ define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.s { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_4s: -; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: mov s0, v0[1] ; EXYNOS: str s0, [x0, x1, lsl #2] %ptr = getelementptr i32, ptr %D, i64 %offset %tmp = extractelement <4 x i32> %A, i32 1 @@ -264,7 +264,7 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) { define void @st1lane_2d(<2 x i64> %A, ptr %D) { ; SD-CHECK-LABEL: 
st1lane_2d: -; SD-CHECK: mov.d v0[0], v0[1] +; SD-CHECK: mov d0, v0[1] ; SD-CHECK: str d0, [x0, #8] ; ; GI-CHECK-LABEL: st1lane_2d: @@ -272,7 +272,7 @@ define void @st1lane_2d(<2 x i64> %A, ptr %D) { ; GI-CHECK: st1.d { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_2d: -; EXYNOS: mov.d v0[0], v0[1] +; EXYNOS: mov d0, v0[1] ; EXYNOS: str d0, [x0, #8] %ptr = getelementptr i64, ptr %D, i64 1 %tmp = extractelement <2 x i64> %A, i32 1 @@ -300,7 +300,7 @@ define void @st1lane0u_2d(<2 x i64> %A, ptr %D) { define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_2d: -; SD-CHECK: mov.d v0[0], v0[1] +; SD-CHECK: mov d0, v0[1] ; SD-CHECK: str d0, [x0, x1, lsl #3] ; ; GI-CHECK-LABEL: st1lane_ro_2d: @@ -308,7 +308,7 @@ define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.d { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_2d: -; EXYNOS: mov.d v0[0], v0[1] +; EXYNOS: mov d0, v0[1] ; EXYNOS: str d0, [x0, x1, lsl #3] %ptr = getelementptr i64, ptr %D, i64 %offset %tmp = extractelement <2 x i64> %A, i32 1 @@ -374,7 +374,7 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) { define void @st1lane_8b(<8 x i8> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_8b: -; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: mov b0, v0[1] ; SD-CHECK: stur b0, [x0, #1] ; ; GI-CHECK-LABEL: st1lane_8b: @@ -382,7 +382,7 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) { ; GI-CHECK: st1.b { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_8b: -; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: mov b0, v0[1] ; EXYNOS: stur b0, [x0, #1] %ptr = getelementptr i8, ptr %D, i64 1 %tmp = extractelement <8 x i8> %A, i32 1 @@ -392,7 +392,7 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) { define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_8b: -; SD-CHECK: mov.b v0[0], v0[1] +; SD-CHECK: mov b0, v0[1] ; SD-CHECK: str b0, [x0, x1] ; ; GI-CHECK-LABEL: st1lane_ro_8b: @@ -400,7 +400,7 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 
%offset) { ; GI-CHECK: st1.b { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_8b: -; EXYNOS: mov.b v0[0], v0[1] +; EXYNOS: mov b0, v0[1] ; EXYNOS: str b0, [x0, x1] %ptr = getelementptr i8, ptr %D, i64 %offset %tmp = extractelement <8 x i8> %A, i32 1 @@ -426,7 +426,7 @@ define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) { define void @st1lane_4h(<4 x i16> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_4h: -; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: mov h0, v0[1] ; SD-CHECK: str h0, [x0, #2] ; ; GI-CHECK-LABEL: st1lane_4h: @@ -434,7 +434,7 @@ define void @st1lane_4h(<4 x i16> %A, ptr %D) { ; GI-CHECK: st1.h { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_4h: -; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: mov h0, v0[1] ; EXYNOS: str h0, [x0, #2] %ptr = getelementptr i16, ptr %D, i64 1 %tmp = extractelement <4 x i16> %A, i32 1 @@ -462,7 +462,7 @@ define void @st1lane0u_4h(<4 x i16> %A, ptr %D) { define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_4h: -; SD-CHECK: mov.h v0[0], v0[1] +; SD-CHECK: mov h0, v0[1] ; SD-CHECK: str h0, [x0, x1, lsl #1] ; ; GI-CHECK-LABEL: st1lane_ro_4h: @@ -470,7 +470,7 @@ define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.h { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_4h: -; EXYNOS: mov.h v0[0], v0[1] +; EXYNOS: mov h0, v0[1] ; EXYNOS: str h0, [x0, x1, lsl #1] %ptr = getelementptr i16, ptr %D, i64 %offset %tmp = extractelement <4 x i16> %A, i32 1 @@ -489,7 +489,7 @@ define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) { define void @st1lane_2s(<2 x i32> %A, ptr %D) { ; SD-CHECK-LABEL: st1lane_2s: -; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: mov s0, v0[1] ; SD-CHECK: str s0, [x0, #4] ; ; GI-CHECK-LABEL: st1lane_2s: @@ -497,7 +497,7 @@ define void @st1lane_2s(<2 x i32> %A, ptr %D) { ; GI-CHECK: st1.s { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_2s: -; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: mov s0, v0[1] ; EXYNOS: str s0, [x0, #4] %ptr = getelementptr i32, ptr %D, i64 1 %tmp = 
extractelement <2 x i32> %A, i32 1 @@ -525,7 +525,7 @@ define void @st1lane0u_2s(<2 x i32> %A, ptr %D) { define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) { ; SD-CHECK-LABEL: st1lane_ro_2s: -; SD-CHECK: mov.s v0[0], v0[1] +; SD-CHECK: mov s0, v0[1] ; SD-CHECK: str s0, [x0, x1, lsl #2] ; ; GI-CHECK-LABEL: st1lane_ro_2s: @@ -533,7 +533,7 @@ define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) { ; GI-CHECK: st1.s { v0 }[1], [x8] ; ; EXYNOS-LABEL: st1lane_ro_2s: -; EXYNOS: mov.s v0[0], v0[1] +; EXYNOS: mov s0, v0[1] ; EXYNOS: str s0, [x0, x1, lsl #2] %ptr = getelementptr i32, ptr %D, i64 %offset %tmp = extractelement <2 x i32> %A, i32 1 diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll index 05f66e4b03ed2..2866214e1e473 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll @@ -9,7 +9,7 @@ define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.s v1[0], v0[1] +; CHECK-NEXT: mov s1, v0[1] ; CHECK-NEXT: str b0, [sp, #14] ; CHECK-NEXT: stur b1, [sp, #15] ; CHECK-NEXT: ldrh w0, [sp, #14] diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index adda3b08af66c..d54cc4adb81b3 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -102,7 +102,7 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] @@ -399,7 +399,7 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; 
CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr s0, [sp, #12] diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 0daa6e7f16202..05a40453833ee 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -480,7 +480,7 @@ define <2 x i64> @concat_high_high_v2i64(<2 x i64> %a_vec, <2 x i64> %b_vec) { ; ; CHECK-GI-LABEL: concat_high_high_v2i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v0.d[0], v0.d[1] +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[1] ; CHECK-GI-NEXT: ret entry: @@ -498,7 +498,7 @@ define <2 x double> @concat_high_high_v2f64(<2 x double> %a_vec, <2 x double> %b ; ; CHECK-GI-LABEL: concat_high_high_v2f64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v0.d[0], v0.d[1] +; CHECK-GI-NEXT: mov d0, v0.d[1] ; CHECK-GI-NEXT: mov v0.d[1], v1.d[1] ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index f941ecf508055..fcd1fa2983420 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -13,7 +13,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: mov v1.s[1], w9 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -48,7 +48,7 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: clz v1.4h, v1.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -153,7 +153,7 @@ define void @v2i16(ptr %p1) { 
; CHECK-SD-NEXT: mov v1.s[1], w9 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -179,7 +179,7 @@ define void @v3i16(ptr %p1) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: clz v0.4h, v0.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index b9671114508db..10ec1d0c1982a 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -13,7 +13,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -47,7 +47,7 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -151,7 +151,7 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -179,7 +179,7 @@ define void @v3i16(ptr %p1) { ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, 
[x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index 03e89c04b184f..60125f8a19811 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -16,7 +16,7 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-NEXT: movi v1.2s, #32 ; CHECK-SD-NEXT: clz v0.2s, v0.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -58,7 +58,7 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: clz v0.4h, v0.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -227,7 +227,7 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: movi v1.2s, #32 ; CHECK-SD-NEXT: clz v0.2s, v0.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -267,7 +267,7 @@ define void @v3i16(ptr %p1) { ; CHECK-SD-NEXT: movi v1.4h, #16 ; CHECK-SD-NEXT: clz v0.4h, v0.4h ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 54ee693db1239..aa4f31fb5f53e 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -271,7 +271,7 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3f32_2: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v2.s[0], v0.s[0] +; 
CHECK-GI-NEXT: mov s2, v0.s[0] ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-GI-NEXT: mov v2.s[1], v0.s[1] ; CHECK-GI-NEXT: mov v2.s[2], v1.s[0] @@ -992,7 +992,7 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) { ; ; CHECK-GI-LABEL: insert_v3i32_2: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: mov s1, v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: mov v1.s[2], w0 ; CHECK-GI-NEXT: mov v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index ae607ffb56c3e..8d9a6e6b92914 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -75,7 +75,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -112,7 +112,7 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -243,7 +243,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -275,7 +275,7 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; 
CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll index cbfa3f82f21b1..8fabd7a618f68 100644 --- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll +++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll @@ -868,7 +868,7 @@ define void @rshrn_v2i32_4(<2 x i32> %a, ptr %p) { ; CHECK-NEXT: movi v1.2s, #8 ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #4 -; CHECK-NEXT: mov v1.s[0], v0.s[1] +; CHECK-NEXT: mov s1, v0.s[1] ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index 86808ae7f9349..f25c90af7968e 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -42,7 +42,7 @@ define void @v2i32_v2i16(<2 x i32> %a, ptr %result) { ; CHECK-LABEL: v2i32_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v1.s[0], v0.s[1] +; CHECK-NEXT: mov s1, v0.s[1] ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret @@ -89,7 +89,7 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) { ; CHECK-LABEL: v2i32_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v1.s[0], v0.s[1] +; CHECK-NEXT: mov s1, v0.s[1] ; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: stur b1, [x0, #1] ; CHECK-NEXT: ret @@ -155,7 +155,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) { ; CHECK-LABEL: v2i16_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v1.s[0], v0.s[1] +; CHECK-NEXT: mov s1, v0.s[1] ; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: stur b1, [x0, #1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll index f17570837515c..02375b07b3482 100644 --- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll +++ 
b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll @@ -12,25 +12,25 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v1.b[0], v0.b[7] -; CHECK-NEXT: mov v2.b[0], v0.b[6] +; CHECK-NEXT: mov b1, v0.b[7] +; CHECK-NEXT: mov b2, v0.b[6] ; CHECK-NEXT: stur b0, [sp, #15] ; CHECK-NEXT: stur b0, [sp, #14] ; CHECK-NEXT: stur b0, [sp, #13] ; CHECK-NEXT: stur b0, [sp, #12] ; CHECK-NEXT: stur b1, [sp, #7] -; CHECK-NEXT: mov v1.b[0], v0.b[5] +; CHECK-NEXT: mov b1, v0.b[5] ; CHECK-NEXT: stur b2, [sp, #6] -; CHECK-NEXT: mov v2.b[0], v0.b[4] +; CHECK-NEXT: mov b2, v0.b[4] ; CHECK-NEXT: stur b0, [sp, #11] ; CHECK-NEXT: stur b0, [sp, #10] ; CHECK-NEXT: stur b1, [sp, #5] -; CHECK-NEXT: mov v1.b[0], v0.b[3] +; CHECK-NEXT: mov b1, v0.b[3] ; CHECK-NEXT: stur b0, [sp, #9] ; CHECK-NEXT: stur b2, [sp, #4] -; CHECK-NEXT: mov v2.b[0], v0.b[2] +; CHECK-NEXT: mov b2, v0.b[2] ; CHECK-NEXT: str b0, [sp] -; CHECK-NEXT: mov v0.b[0], v0.b[1] +; CHECK-NEXT: mov b0, v0.b[1] ; CHECK-NEXT: stur b1, [sp, #3] ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: stur b2, [sp, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 0a47ced6c05f0..d54dde3c86364 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -200,7 +200,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret @@ -255,7 +255,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr 
v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 6bfb89fe541c8..4c8f0c9c446f5 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -288,7 +288,7 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [sp, #12] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] @@ -499,7 +499,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: dup v1.2s, v0.s[0] ; CHECK-SD-NEXT: str h0, [sp, #12] -; CHECK-SD-NEXT: mov v1.s[0], v1.s[1] +; CHECK-SD-NEXT: mov s1, v1.s[1] ; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 6c7f458e345ca..dc39ad0571b14 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -201,7 +201,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret @@ -256,7 +256,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov 
v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index bc4341b8058ef..3a9f12b838702 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -110,7 +110,7 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v2i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -146,21 +146,13 @@ define void @store_v32i8(<32 x i8> %a, ptr %ptr){ } define void @store_v2i16(<2 x i16> %a, ptr %ptr){ -; CHECK-SD-LABEL: store_v2i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] -; CHECK-SD-NEXT: str h0, [x0] -; CHECK-SD-NEXT: str h1, [x0, #2] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: store_v2i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: str h0, [x0] -; CHECK-GI-NEXT: str h1, [x0, #2] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: store_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] +; CHECK-NEXT: ret store <2 x i16> %a, ptr %ptr ret void } @@ -239,8 +231,8 @@ define void @store_v7i8(<7 x i8> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v7i8: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v1.b[0], v0.b[6] -; CHECK-SD-NEXT: mov v2.h[0], v0.h[2] +; CHECK-SD-NEXT: mov b1, v0.b[6] +; CHECK-SD-NEXT: mov h2, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #6] ; CHECK-SD-NEXT: str h2, [x0, #4] @@ -271,7 +263,7 @@ define void @store_v3i16(<3 x i16> %a, 
ptr %ptr){ ; CHECK-SD-LABEL: store_v3i16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret @@ -292,8 +284,8 @@ define void @store_v3i16(<3 x i16> %a, ptr %ptr){ define void @store_v7i16(<7 x i16> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v7i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov v1.h[0], v0.h[6] -; CHECK-SD-NEXT: mov v2.s[0], v0.s[2] +; CHECK-SD-NEXT: mov h1, v0.h[6] +; CHECK-SD-NEXT: mov s2, v0.s[2] ; CHECK-SD-NEXT: str d0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #12] ; CHECK-SD-NEXT: str s2, [x0, #8] @@ -322,7 +314,7 @@ define void @store_v7i16(<7 x i16> %a, ptr %ptr){ define void @store_v3i32(<3 x i32> %a, ptr %ptr){ ; CHECK-SD-LABEL: store_v3i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov v1.s[0], v0.s[2] +; CHECK-SD-NEXT: mov s1, v0.s[2] ; CHECK-SD-NEXT: str d0, [x0] ; CHECK-SD-NEXT: str s1, [x0, #8] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index f482668be311a..7a436eddb23a6 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -63,7 +63,7 @@ define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] ; CHECK-SD-NEXT: ret @@ -100,7 +100,7 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: mov v0.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h0, v0.h[2] ; CHECK-SD-NEXT: str s1, [sp, #12] ; CHECK-SD-NEXT: ldrh w8, [sp, #12] ; CHECK-SD-NEXT: stur b0, [x0, #2] @@ -231,7 +231,7 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: 
ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret @@ -263,7 +263,7 @@ define void @v3i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ldr d0, [x0] ; CHECK-SD-NEXT: ldr d1, [x1] ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[2] +; CHECK-SD-NEXT: mov h1, v0.h[2] ; CHECK-SD-NEXT: str s0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #4] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll index e8ab228db4279..0d0b5cbc776c4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll @@ -380,32 +380,32 @@ define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 { ; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov v1.h[0], v0.h[4] -; CHECK-NEXT: mov v2.h[0], v0.h[5] -; CHECK-NEXT: mov v3.h[0], v0.h[6] -; CHECK-NEXT: mov v4.h[0], v0.h[7] +; CHECK-NEXT: mov h1, v0.h[4] +; CHECK-NEXT: mov h2, v0.h[5] +; CHECK-NEXT: mov h3, v0.h[6] +; CHECK-NEXT: mov h4, v0.h[7] ; CHECK-NEXT: str h0, [sp, #22] ; CHECK-NEXT: st1 { v5.h }[3], [x8] ; CHECK-NEXT: str h5, [sp, #6] ; CHECK-NEXT: str h1, [sp, #30] -; CHECK-NEXT: mov v1.h[0], v0.h[1] +; CHECK-NEXT: mov h1, v0.h[1] ; CHECK-NEXT: str h2, [sp, #28] -; CHECK-NEXT: mov v2.h[0], v0.h[2] -; CHECK-NEXT: mov v0.h[0], v0.h[3] +; CHECK-NEXT: mov h2, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: str h3, [sp, #26] -; CHECK-NEXT: mov v3.h[0], v5.h[2] +; CHECK-NEXT: mov h3, v5.h[2] ; CHECK-NEXT: str h4, [sp, #24] ; CHECK-NEXT: str h1, [sp, #20] -; CHECK-NEXT: mov v1.h[0], v5.h[4] +; CHECK-NEXT: mov h1, v5.h[4] ; CHECK-NEXT: str h2, [sp, #18] -; CHECK-NEXT: mov v2.h[0], v5.h[5] +; CHECK-NEXT: mov h2, v5.h[5] 
; CHECK-NEXT: str h0, [sp, #16] -; CHECK-NEXT: mov v0.h[0], v5.h[6] +; CHECK-NEXT: mov h0, v5.h[6] ; CHECK-NEXT: str h3, [sp, #2] ; CHECK-NEXT: str h1, [sp, #14] -; CHECK-NEXT: mov v1.h[0], v5.h[7] +; CHECK-NEXT: mov h1, v5.h[7] ; CHECK-NEXT: str h2, [sp, #12] -; CHECK-NEXT: mov v2.h[0], v5.h[1] +; CHECK-NEXT: mov h2, v5.h[1] ; CHECK-NEXT: str h0, [sp, #10] ; CHECK-NEXT: str h1, [sp, #8] ; CHECK-NEXT: str h2, [sp, #4] diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 8f5359bcaa044..b5d64112db727 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -366,7 +366,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: xtn v6.4h, v4.4s ; CHECK-NEXT: xtn v7.4h, v2.4s ; CHECK-NEXT: tbl v2.16b, { v5.16b, v6.16b, v7.16b }, v1.16b -; CHECK-NEXT: mov v3.s[0], v2.s[2] +; CHECK-NEXT: mov s3, v2.s[2] ; CHECK-NEXT: str d2, [x0] ; CHECK-NEXT: str s3, [x0, #8] ; CHECK-NEXT: add x0, x0, #12 diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll index a2ebc416e042f..fd23f3da18cd7 100644 --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -718,8 +718,8 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: uzp1.8h v1, v2, v1 ; CHECK-NEXT: uzp1.8b v2, v0, v0 ; CHECK-NEXT: uzp1.16b v0, v1, v0 -; CHECK-NEXT: mov.b v1[0], v2[2] -; CHECK-NEXT: mov.h v2[0], v0[4] +; CHECK-NEXT: mov b1, v2[2] +; CHECK-NEXT: mov h2, v0[4] ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: stur b1, [x1, #10] ; CHECK-NEXT: str h2, [x1, #8] @@ -753,11 +753,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-BE-NEXT: uzp1 v1.16b, v1.16b, v0.16b ; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-BE-NEXT: rev16 v2.16b, v1.16b -; CHECK-BE-NEXT: mov v0.b[0], v0.b[2] ; CHECK-BE-NEXT: rev64 v1.16b, v1.16b -; CHECK-BE-NEXT: mov v2.h[0], v2.h[4] -; 
CHECK-BE-NEXT: stur b0, [x1, #10] +; CHECK-BE-NEXT: mov b0, v0.b[2] +; CHECK-BE-NEXT: mov h2, v2.h[4] ; CHECK-BE-NEXT: str d1, [x1] +; CHECK-BE-NEXT: stur b0, [x1, #10] ; CHECK-BE-NEXT: str h2, [x1, #8] ; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: b.eq .LBB6_1 @@ -789,11 +789,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-DISABLE-NEXT: uzp1 v1.16b, v1.16b, v0.16b ; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b -; CHECK-DISABLE-NEXT: mov v0.b[0], v0.b[2] ; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b -; CHECK-DISABLE-NEXT: mov v2.h[0], v2.h[4] -; CHECK-DISABLE-NEXT: stur b0, [x1, #10] +; CHECK-DISABLE-NEXT: mov b0, v0.b[2] +; CHECK-DISABLE-NEXT: mov h2, v2.h[4] ; CHECK-DISABLE-NEXT: str d1, [x1] +; CHECK-DISABLE-NEXT: stur b0, [x1, #10] ; CHECK-DISABLE-NEXT: str h2, [x1, #8] ; CHECK-DISABLE-NEXT: add x1, x1, #16 ; CHECK-DISABLE-NEXT: b.eq .LBB6_1 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 4e06e062e0c7d..14a578fa317d0 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -198,7 +198,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret @@ -254,7 +254,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 
fc6c756f1b17e..ddb3332abf5d0 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -197,7 +197,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v0.s[1], w10 ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] ; CHECK-SD-NEXT: stur b1, [x2, #1] ; CHECK-SD-NEXT: ret @@ -251,7 +251,7 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v0.s[1], w10 ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov v1.s[0], v0.s[1] +; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] ; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 1529a17b2a70d..7d3f5bc270d6b 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -360,7 +360,7 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: lsr w9, w8, #16 ; CHECK-NEXT: strb w8, [x1] -; CHECK-NEXT: mov.b v0[0], v0[4] +; CHECK-NEXT: mov b0, v0[4] ; CHECK-NEXT: strb w9, [x1, #1] ; CHECK-NEXT: stur b0, [x1, #2] ; CHECK-NEXT: ret @@ -399,8 +399,8 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr d1, [x8, lCPI11_0@PAGEOFF] ; CHECK-NEXT: ld1.h { v0 }[2], [x9] ; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: mov.b v1[0], v0[2] -; CHECK-NEXT: mov.b v2[0], v0[4] +; CHECK-NEXT: mov b1, v0[2] +; CHECK-NEXT: mov b2, v0[4] ; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: stur b1, [x1, #1] ; CHECK-NEXT: stur b2, [x1, #2] @@ -420,7 +420,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v1.4h }, [x8] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; 
BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -445,7 +445,7 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: mov.h v1[0], v0[2] +; CHECK-NEXT: mov h1, v0[2] ; CHECK-NEXT: str s0, [x1] ; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret @@ -462,10 +462,10 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ushll v0.8h, v0.8b, #0 ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: mov v1.h[0], v0.h[2] -; BE-NEXT: rev32 v0.8h, v0.8h -; BE-NEXT: str h1, [x1, #4] -; BE-NEXT: str s0, [x1] +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: mov h0, v0.h[2] +; BE-NEXT: str s1, [x1] +; BE-NEXT: str h0, [x1, #4] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -481,7 +481,7 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: mov.h v1[0], v0[2] +; CHECK-NEXT: mov h1, v0[2] ; CHECK-NEXT: str s0, [x1] ; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret @@ -492,10 +492,10 @@ define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) { ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: mov v1.h[0], v0.h[2] -; BE-NEXT: rev32 v0.8h, v0.8h -; BE-NEXT: str h1, [x1, #4] -; BE-NEXT: str s0, [x1] +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: mov h0, v0.h[2] +; BE-NEXT: str s1, [x1] +; BE-NEXT: str h0, [x1, #4] ; BE-NEXT: ret entry: %l = load <3 x i8>, ptr %src @@ -510,7 +510,7 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 -; CHECK-NEXT: mov.h v1[0], v0[2] +; CHECK-NEXT: mov h1, v0[2] ; 
CHECK-NEXT: str s0, [x1] ; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret @@ -521,10 +521,10 @@ define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) { ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: bic v0.4h, #255, lsl #8 -; BE-NEXT: mov v1.h[0], v0.h[2] -; BE-NEXT: rev32 v0.8h, v0.8h -; BE-NEXT: str h1, [x1, #4] -; BE-NEXT: str s0, [x1] +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: mov h0, v0.h[2] +; BE-NEXT: str s1, [x1] +; BE-NEXT: str h0, [x1, #4] ; BE-NEXT: ret entry: %l = load <3 x i8>, ptr %src, align 4 @@ -547,7 +547,7 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 -; CHECK-NEXT: mov.h v1[0], v0[2] +; CHECK-NEXT: mov h1, v0[2] ; CHECK-NEXT: str s0, [x1] ; CHECK-NEXT: str h1, [x1, #4] ; CHECK-NEXT: ret @@ -569,10 +569,10 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v1.4h }, [x8] ; BE-NEXT: bic v0.4h, #255, lsl #8 ; BE-NEXT: add v0.4h, v0.4h, v1.4h -; BE-NEXT: mov v1.h[0], v0.h[2] -; BE-NEXT: rev32 v0.8h, v0.8h -; BE-NEXT: str h1, [x1, #4] -; BE-NEXT: str s0, [x1] +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: mov h0, v0.h[2] +; BE-NEXT: str s1, [x1] +; BE-NEXT: str h0, [x1, #4] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -588,8 +588,8 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: mov.b v1[0], v0[4] -; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: mov b1, v0[4] +; CHECK-NEXT: mov b2, v0[8] ; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: stur b1, [x1, #1] ; CHECK-NEXT: stur b2, [x1, #2] @@ -602,7 +602,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; 
BE-NEXT: ldrh w8, [sp, #12] @@ -622,8 +622,8 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: mov.b v1[0], v0[4] -; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: mov b1, v0[4] +; CHECK-NEXT: mov b2, v0[8] ; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: stur b1, [x1, #1] ; CHECK-NEXT: stur b2, [x1, #2] @@ -636,7 +636,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -656,8 +656,8 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: mov.b v1[0], v0[4] -; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: mov b1, v0[4] +; CHECK-NEXT: mov b2, v0[8] ; CHECK-NEXT: str b0, [x1] ; CHECK-NEXT: stur b1, [x1, #1] ; CHECK-NEXT: stur b2, [x1, #2] @@ -670,7 +670,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -690,8 +690,8 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: mov.b v1[0], v0[4] -; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: mov b1, v0[4] +; CHECK-NEXT: mov b2, v0[8] ; CHECK-NEXT: stur b0, [x1, #1] ; CHECK-NEXT: stur b1, [x1, #2] ; CHECK-NEXT: stur b2, [x1, #3] @@ -704,7 +704,7 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, 
#16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -725,8 +725,8 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; CHECK: ; %bb.0: ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ushr.4s v0, v0, #16 -; CHECK-NEXT: mov.b v1[0], v0[4] -; CHECK-NEXT: mov.b v2[0], v0[8] +; CHECK-NEXT: mov b1, v0[4] +; CHECK-NEXT: mov b2, v0[8] ; CHECK-NEXT: stur b0, [x1, #3] ; CHECK-NEXT: stur b1, [x1, #4] ; CHECK-NEXT: stur b2, [x1, #5] @@ -739,7 +739,7 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -763,7 +763,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 -; CHECK-NEXT: mov.h v0[0], v0[2] +; CHECK-NEXT: mov h0, v0[2] ; CHECK-NEXT: str s1, [sp, #12] ; CHECK-NEXT: ldrh w8, [sp, #12] ; CHECK-NEXT: stur b0, [x1, #2] @@ -778,7 +778,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] ; BE-NEXT: ldrh w8, [sp, #12] @@ -806,8 +806,8 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: mov.b v1[0], v0[8] -; CHECK-NEXT: mov.b v2[0], v0[4] +; CHECK-NEXT: mov b1, v0[8] +; CHECK-NEXT: mov b2, v0[4] ; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: stur b1, [x0, #2] ; CHECK-NEXT: stur b2, [x0, #1] @@ -830,7 
+830,7 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] ; BE-NEXT: ldrh w8, [sp, #8] @@ -859,8 +859,8 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: mov.b v1[0], v0[8] -; CHECK-NEXT: mov.b v2[0], v0[4] +; CHECK-NEXT: mov b1, v0[8] +; CHECK-NEXT: mov b2, v0[4] ; CHECK-NEXT: str b0, [x0] ; CHECK-NEXT: stur b1, [x0, #2] ; CHECK-NEXT: stur b2, [x0, #1] @@ -883,7 +883,7 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b -; BE-NEXT: mov v0.h[0], v0.h[2] +; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] ; BE-NEXT: ldrh w8, [sp, #8] diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index f9654fdb41bbc..b29195eed9149 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -50,7 +50,7 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; CHECK-LABEL: uaddo_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v2.s[0], v1.s[2] +; CHECK-NEXT: mov s2, v1.s[2] ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: str s2, [x0, #8] diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index 4f4f1ed8f8ed3..12ea8862a03cd 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -59,7 +59,7 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s ; 
CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: cmtst v0.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v2.s[0], v1.s[2] +; CHECK-NEXT: mov s2, v1.s[2] ; CHECK-NEXT: str s2, [x0, #8] ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 672b6af11cfc3..a580913d40d95 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -215,7 +215,7 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: umov.b w15, v2[4] ; CHECK-NEXT: str s0, [sp] -; CHECK-NEXT: mov.s v3[0], v0[3] +; CHECK-NEXT: mov s3, v0[3] ; CHECK-NEXT: and x10, x10, #0x1 ; CHECK-NEXT: and x14, x9, #0x1 ; CHECK-NEXT: bfi x11, x9, #2, #1 @@ -238,7 +238,7 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: and x10, x10, #0x7 ; CHECK-NEXT: str s1, [x8, x12, lsl #2] ; CHECK-NEXT: and x12, x9, #0x7 -; CHECK-NEXT: mov.s v0[0], v1[3] +; CHECK-NEXT: mov s0, v1[3] ; CHECK-NEXT: and w11, w11, #0x1 ; CHECK-NEXT: add x10, x8, x10, lsl #2 ; CHECK-NEXT: add x12, x8, x12, lsl #2 diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 70e468a2b7586..eb83aa5a13e52 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1588,13 +1588,13 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) { ; CHECK-NEXT: ushll2.2d v3, v0, #0 ; CHECK-NEXT: ushll.2d v0, v0, #0 ; CHECK-NEXT: str d2, [x1, #96] -; CHECK-NEXT: mov.d v2[0], v2[1] +; CHECK-NEXT: mov d2, v2[1] ; CHECK-NEXT: str d1, [x1, #64] -; CHECK-NEXT: mov.d v1[0], v1[1] +; CHECK-NEXT: mov d1, v1[1] ; CHECK-NEXT: str d3, [x1, #32] -; CHECK-NEXT: mov.d v3[0], v3[1] +; CHECK-NEXT: mov d3, v3[1] ; CHECK-NEXT: str d0, [x1] -; CHECK-NEXT: mov.d v0[0], v0[1] +; CHECK-NEXT: mov d0, v0[1] ; 
CHECK-NEXT: str d2, [x1, #112] ; CHECK-NEXT: str d1, [x1, #80] ; CHECK-NEXT: str d3, [x1, #48] @@ -1629,13 +1629,13 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-BE-NEXT: str d2, [x1, #104] -; CHECK-BE-NEXT: mov v2.d[0], v2.d[1] +; CHECK-BE-NEXT: mov d2, v2.d[1] ; CHECK-BE-NEXT: str d1, [x1, #72] -; CHECK-BE-NEXT: mov v1.d[0], v1.d[1] +; CHECK-BE-NEXT: mov d1, v1.d[1] ; CHECK-BE-NEXT: str d3, [x1, #40] -; CHECK-BE-NEXT: mov v3.d[0], v3.d[1] +; CHECK-BE-NEXT: mov d3, v3.d[1] ; CHECK-BE-NEXT: str d0, [x1, #8] -; CHECK-BE-NEXT: mov v0.d[0], v0.d[1] +; CHECK-BE-NEXT: mov d0, v0.d[1] ; CHECK-BE-NEXT: str d2, [x1, #120] ; CHECK-BE-NEXT: str d1, [x1, #88] ; CHECK-BE-NEXT: str d3, [x1, #56] @@ -2200,7 +2200,7 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-NEXT: tbl.16b v7, { v4 }, v2 ; CHECK-NEXT: tbl.16b v4, { v4 }, v1 ; CHECK-NEXT: stp q7, q6, [x1, #16] -; CHECK-NEXT: mov.s v6[0], v5[2] +; CHECK-NEXT: mov s6, v5[2] ; CHECK-NEXT: str q4, [x1] ; CHECK-NEXT: str d5, [x1, #48] ; CHECK-NEXT: str s6, [x1, #56] @@ -2248,7 +2248,7 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: rev64 v4.16b, v4.16b ; CHECK-BE-NEXT: st1 { v5.16b }, [x1] ; CHECK-BE-NEXT: st1 { v16.16b }, [x9] -; CHECK-BE-NEXT: mov v6.s[0], v7.s[2] +; CHECK-BE-NEXT: mov s6, v7.s[2] ; CHECK-BE-NEXT: str d4, [x1, #48] ; CHECK-BE-NEXT: str s6, [x1, #56] ; CHECK-BE-NEXT: add x1, x1, #64 @@ -2615,7 +2615,7 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: rev64 v17.16b, v19.16b ; CHECK-BE-NEXT: add x9, x1, #112 ; CHECK-BE-NEXT: tbl v7.16b, { v7.16b }, v1.16b -; CHECK-BE-NEXT: mov v18.h[0], v18.h[4] +; CHECK-BE-NEXT: mov h18, v18.h[4] ; CHECK-BE-NEXT: st1 { v20.16b }, [x9] ; CHECK-BE-NEXT: add x9, x1, #96 ; CHECK-BE-NEXT: st1 { v16.16b }, [x9] From 2f39546460ba638d6ae3190b217c7b46d300ec93 Mon Sep 17 00:00:00 2001 
From: Benjamin Maxwell Date: Fri, 11 Apr 2025 16:18:57 +0000 Subject: [PATCH 10/12] Remove SVE FIXME --- .../Target/AArch64/AArch64ISelLowering.cpp | 10 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 ++ .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 198 ++++++------------ ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 172 +++++++-------- 4 files changed, 167 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a9b32e2bb769f..2c2f63854dbed 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24063,15 +24063,9 @@ static SDValue performSTORECombine(SDNode *N, // Handle extracting from lanes != 0. SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Value.getValueType(), Vector, ExtIdx); - // FIXME: Using a fixed-size vector for the insertion should not be - // necessary, but SVE ISEL is missing some folds to avoid fmovs. SDValue Zero = DAG.getVectorIdxConstant(0, DL); - EVT InsertVectorVT = EVT::getVectorVT( - *DAG.getContext(), ElemVT, - ElementCount::getFixed( - VectorVT.getVectorElementCount().getKnownMinValue())); - ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT, - DAG.getUNDEF(InsertVectorVT), Ext, Zero); + ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, + DAG.getUNDEF(VectorVT), Ext, Zero); } EVT FPMemVT = MemVT == MVT::i8 diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a2f326c994c2f..9566e332a884d 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3475,6 +3475,22 @@ let Predicates = [HasSVE_or_SME] in { (EXTRACT_SUBREG ZPR:$Zs, dsub)>; } + multiclass sve_insert_extract_elt { + // NOP pattern (needed to avoid pointless DUPs being added by the second pattern). 
+ def : Pat<(VT (vector_insert undef, + (VTScalar (vector_extract VT:$vec, (i64 0))), (i64 0))), + (VT $vec)>; + + def : Pat<(VT (vector_insert undef, + (VTScalar (vector_extract VT:$vec, (i64 IdxTy:$Idx))), (i64 0))), + (DUP ZPR:$vec, IdxTy:$Idx)>; + } + + defm : sve_insert_extract_elt; + defm : sve_insert_extract_elt; + defm : sve_insert_extract_elt; + defm : sve_insert_extract_elt; + multiclass sve_predicated_add { def : Pat<(nxv16i8 (add ZPR:$op, (extend nxv16i1:$pred))), (ADD_ZPmZ_B PPR:$pred, ZPR:$op, (DUP_ZI_B value, 0))>; diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index 6c8fd3c5c6029..eb215898a7ad5 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -1,22 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING -; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define void @test_str_lane_s32(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s32: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; CHECK-NONSTREAMING-NEXT: str s0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s32: -; STREAMING-COMPAT: // %bb.0: 
// %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str s0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -37,17 +31,11 @@ entry: } define void @test_str_lane_s64(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s64: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov d0, v0.d[1] -; CHECK-NONSTREAMING-NEXT: str d0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s64: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: str d0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -118,17 +106,11 @@ entry: } define void @test_str_lane_s8(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov b0, v0.b[7] -; CHECK-NONSTREAMING-NEXT: str b0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] -; STREAMING-COMPAT-NEXT: str b0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 7 store i8 %0, ptr %a, align 1 @@ -147,17 +129,11 @@ entry: } define void @test_str_lane_s16(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s16: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov h0, v0.h[3] -; CHECK-NONSTREAMING-NEXT: str h0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s16: 
-; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -284,17 +260,11 @@ define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; CHECK-NONSTREAMING-NEXT: stur s0, [x0, #-32] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s32_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur s0, [x0, #-32] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -317,17 +287,11 @@ entry: } define void @test_str_lane_s64_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov d0, v0.d[1] -; CHECK-NONSTREAMING-NEXT: stur d0, [x0, #-64] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s64_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: stur d0, [x0, #-64] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -350,17 +314,11 @@ entry: } define void @test_str_lane_s8_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: 
test_str_lane_s8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov b0, v0.b[7] -; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] -; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.b, z0.b[7] +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 7 %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 @@ -381,17 +339,11 @@ entry: } define void @test_str_lane_s16_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov h0, v0.h[3] -; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] -; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -414,17 +366,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; CHECK-NONSTREAMING-NEXT: str h0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: 
test_str_trunc_lane_s32_to_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -448,17 +394,11 @@ entry: define void @test_str_trunc_lane_s32_to_s8(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; CHECK-NONSTREAMING-NEXT: str b0, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str b0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane_s32_to_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str b0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 %trunc = trunc i32 %0 to i8 @@ -494,17 +434,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; CHECK-NONSTREAMING-NEXT: stur h0, [x0, #-16] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -529,17 +463,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov s0, v0.s[3] -; 
CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur b0, [x0, #-8] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 %trunc = trunc i32 %0 to i8 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index a20a330b39bb4..3d9f407c3064c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -159,26 +159,26 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z4.h, z3.h[7] ; CHECK-NEXT: mov z6.h, z3.h[6] ; CHECK-NEXT: mov z16.h, z3.h[5] -; CHECK-NEXT: mov z20.h, z2.h[7] -; CHECK-NEXT: mov z21.h, z1.h[7] ; CHECK-NEXT: mov z18.h, z3.h[4] ; CHECK-NEXT: mov z19.h, z0.h[4] +; CHECK-NEXT: mov z20.h, z2.h[7] +; CHECK-NEXT: mov z21.h, z1.h[7] ; CHECK-NEXT: mov z22.h, z2.h[6] ; CHECK-NEXT: mov z23.h, z1.h[6] ; CHECK-NEXT: zip1 z24.h, z5.h, z4.h ; CHECK-NEXT: zip1 z25.h, z7.h, z6.h -; CHECK-NEXT: zip1 z17.h, z17.h, z16.h +; CHECK-NEXT: zip1 z16.h, z17.h, z16.h ; CHECK-NEXT: ldp q4, q6, [x0, #32] -; CHECK-NEXT: zip1 z16.h, z21.h, z20.h +; CHECK-NEXT: zip1 z17.h, z19.h, z18.h ; CHECK-NEXT: ldp q5, q7, [x1, #32] -; CHECK-NEXT: zip1 z18.h, z19.h, z18.h -; CHECK-NEXT: zip1 z19.s, z25.s, z24.s +; CHECK-NEXT: zip1 z18.h, z21.h, z20.h +; CHECK-NEXT: zip1 z21.s, z25.s, z24.s ; CHECK-NEXT: zip1 z22.h, z23.h, z22.h ; CHECK-NEXT: mov z23.h, z2.h[5] -; CHECK-NEXT: mov z21.h, z6.h[7] +; CHECK-NEXT: mov z20.h, z6.h[7] ; CHECK-NEXT: 
mov z24.h, z1.h[5] ; CHECK-NEXT: mov z25.h, z2.h[4] -; CHECK-NEXT: mov z20.h, z7.h[7] +; CHECK-NEXT: mov z19.h, z7.h[7] ; CHECK-NEXT: mov z26.h, z1.h[4] ; CHECK-NEXT: mov z27.h, z6.h[6] ; CHECK-NEXT: mov z28.h, z7.h[5] @@ -187,8 +187,8 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z31.h, z6.h[4] ; CHECK-NEXT: mov z8.h, z5.h[7] ; CHECK-NEXT: mov z9.h, z4.h[7] -; CHECK-NEXT: zip1 z20.h, z21.h, z20.h -; CHECK-NEXT: mov z21.h, z7.h[6] +; CHECK-NEXT: zip1 z19.h, z20.h, z19.h +; CHECK-NEXT: mov z20.h, z7.h[6] ; CHECK-NEXT: mov z10.h, z5.h[6] ; CHECK-NEXT: mov z11.h, z4.h[6] ; CHECK-NEXT: mov z12.h, z5.h[5] @@ -196,7 +196,7 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z14.h, z5.h[4] ; CHECK-NEXT: mov z15.h, z4.h[4] ; CHECK-NEXT: zip1 z23.h, z24.h, z23.h -; CHECK-NEXT: zip1 z21.h, z27.h, z21.h +; CHECK-NEXT: zip1 z20.h, z27.h, z20.h ; CHECK-NEXT: zip1 z27.h, z29.h, z28.h ; CHECK-NEXT: zip1 z28.h, z31.h, z30.h ; CHECK-NEXT: zip1 z24.h, z26.h, z25.h @@ -207,23 +207,23 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: zip1 z30.h, z15.h, z14.h ; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: zip1 z17.s, z18.s, z17.s -; CHECK-NEXT: zip1 z18.s, z21.s, z20.s -; CHECK-NEXT: zip1 z20.s, z28.s, z27.s -; CHECK-NEXT: zip1 z16.s, z22.s, z16.s -; CHECK-NEXT: zip1 z21.s, z24.s, z23.s +; CHECK-NEXT: zip1 z16.s, z17.s, z16.s +; CHECK-NEXT: zip1 z17.s, z20.s, z19.s +; CHECK-NEXT: zip1 z19.s, z28.s, z27.s +; CHECK-NEXT: zip1 z18.s, z22.s, z18.s +; CHECK-NEXT: zip1 z20.s, z24.s, z23.s ; CHECK-NEXT: zip1 z0.h, z0.h, z3.h ; CHECK-NEXT: zip1 z3.s, z26.s, z25.s ; CHECK-NEXT: zip1 z22.s, z30.s, z29.s ; CHECK-NEXT: zip1 z6.h, z6.h, z7.h -; CHECK-NEXT: zip1 z7.d, z17.d, z19.d -; CHECK-NEXT: zip1 z17.d, z20.d, z18.d +; CHECK-NEXT: zip1 z7.d, z16.d, z21.d +; CHECK-NEXT: zip1 z16.d, z19.d, z17.d ; CHECK-NEXT: zip1 z1.h, z1.h, z2.h ; CHECK-NEXT: zip1 z2.h, z4.h, z5.h -; 
CHECK-NEXT: zip1 z4.d, z21.d, z16.d +; CHECK-NEXT: zip1 z4.d, z20.d, z18.d ; CHECK-NEXT: zip1 z3.d, z22.d, z3.d ; CHECK-NEXT: add z0.h, z0.h, z6.h -; CHECK-NEXT: add z5.h, z7.h, z17.h +; CHECK-NEXT: add z5.h, z7.h, z16.h ; CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: add z2.h, z4.h, z3.h ; CHECK-NEXT: stp q0, q5, [x0, #32] @@ -1476,44 +1476,44 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: zip1 z20.b, z24.b, z23.b ; CHECK-NEXT: zip1 z21.b, z26.b, z25.b ; CHECK-NEXT: zip1 z22.b, z28.b, z27.b +; CHECK-NEXT: zip1 z23.b, z17.b, z29.b ; CHECK-NEXT: mov z24.b, z2.b[14] ; CHECK-NEXT: mov z25.b, z2.b[12] ; CHECK-NEXT: mov z26.b, z2.b[10] ; CHECK-NEXT: mov z27.b, z2.b[8] -; CHECK-NEXT: zip1 z23.b, z17.b, z29.b ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h ; CHECK-NEXT: zip1 z4.h, z6.h, z5.h ; CHECK-NEXT: zip1 z5.h, z7.h, z18.h ; CHECK-NEXT: zip1 z6.h, z19.h, z16.h ; CHECK-NEXT: zip1 z7.h, z21.h, z20.h +; CHECK-NEXT: zip1 z16.h, z23.h, z22.h ; CHECK-NEXT: zip1 z18.b, z25.b, z24.b ; CHECK-NEXT: zip1 z19.b, z27.b, z26.b ; CHECK-NEXT: mov z20.b, z2.b[6] ; CHECK-NEXT: mov z21.b, z2.b[4] +; CHECK-NEXT: mov z23.b, z17.b[15] +; CHECK-NEXT: mov z24.b, z17.b[13] ; CHECK-NEXT: mov z29.b, z17.b[3] ; CHECK-NEXT: mov z30.b, z17.b[1] ; CHECK-NEXT: mov z31.b, z2.b[15] ; CHECK-NEXT: mov z8.b, z2.b[13] -; CHECK-NEXT: zip1 z16.h, z23.h, z22.h ; CHECK-NEXT: mov z22.b, z2.b[2] -; CHECK-NEXT: mov z23.b, z17.b[15] -; CHECK-NEXT: mov z24.b, z17.b[13] ; CHECK-NEXT: mov z25.b, z17.b[11] ; CHECK-NEXT: mov z26.b, z17.b[9] ; CHECK-NEXT: mov z27.b, z17.b[7] ; CHECK-NEXT: mov z28.b, z17.b[5] ; CHECK-NEXT: zip1 z17.h, z19.h, z18.h -; CHECK-NEXT: zip1 z21.b, z21.b, z20.b -; CHECK-NEXT: zip1 z19.b, z30.b, z29.b -; CHECK-NEXT: zip1 z20.b, z8.b, z31.b +; CHECK-NEXT: zip1 z18.b, z21.b, z20.b +; CHECK-NEXT: zip1 z20.b, z24.b, z23.b +; CHECK-NEXT: zip1 z23.b, z30.b, z29.b +; CHECK-NEXT: zip1 z24.b, z8.b, z31.b ; CHECK-NEXT: mov z29.b, z1.b[15] ; CHECK-NEXT: mov z30.b, z1.b[13] ; CHECK-NEXT: mov z31.b, 
z1.b[11] ; CHECK-NEXT: mov z8.b, z1.b[9] -; CHECK-NEXT: zip1 z22.b, z2.b, z22.b -; CHECK-NEXT: zip1 z23.b, z24.b, z23.b -; CHECK-NEXT: zip1 z24.b, z26.b, z25.b -; CHECK-NEXT: zip1 z18.b, z28.b, z27.b +; CHECK-NEXT: zip1 z19.b, z2.b, z22.b +; CHECK-NEXT: zip1 z21.b, z26.b, z25.b +; CHECK-NEXT: zip1 z22.b, z28.b, z27.b ; CHECK-NEXT: mov z25.b, z2.b[11] ; CHECK-NEXT: mov z26.b, z2.b[9] ; CHECK-NEXT: mov z27.b, z2.b[7] @@ -1538,25 +1538,25 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; CHECK-NEXT: zip1 z25.b, z26.b, z25.b ; CHECK-NEXT: zip1 z26.b, z28.b, z27.b ; CHECK-NEXT: zip1 z2.b, z2.b, z8.b -; CHECK-NEXT: zip1 z21.h, z22.h, z21.h -; CHECK-NEXT: zip1 z22.h, z24.h, z23.h -; CHECK-NEXT: zip1 z23.h, z31.h, z29.h +; CHECK-NEXT: zip1 z18.h, z19.h, z18.h +; CHECK-NEXT: zip1 z19.h, z21.h, z20.h +; CHECK-NEXT: zip1 z20.h, z31.h, z29.h ; CHECK-NEXT: zip1 z1.h, z1.h, z9.h ; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: zip1 z24.h, z10.h, z11.h +; CHECK-NEXT: zip1 z21.h, z10.h, z11.h ; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: zip1 z0.h, z30.h, z0.h -; CHECK-NEXT: zip1 z18.h, z19.h, z18.h -; CHECK-NEXT: zip1 z19.h, z25.h, z20.h +; CHECK-NEXT: zip1 z22.h, z23.h, z22.h +; CHECK-NEXT: zip1 z23.h, z25.h, z24.h ; CHECK-NEXT: zip1 z2.h, z2.h, z26.h ; CHECK-NEXT: zip1 z3.s, z4.s, z3.s ; CHECK-NEXT: zip1 z4.s, z6.s, z5.s ; CHECK-NEXT: zip1 z5.s, z16.s, z7.s -; CHECK-NEXT: zip1 z1.s, z1.s, z23.s -; CHECK-NEXT: zip1 z6.s, z21.s, z17.s -; CHECK-NEXT: zip1 z0.s, z0.s, z24.s -; CHECK-NEXT: zip1 z7.s, z18.s, z22.s -; CHECK-NEXT: zip1 z2.s, z2.s, z19.s +; CHECK-NEXT: zip1 z1.s, z1.s, z20.s +; CHECK-NEXT: zip1 z6.s, z18.s, z17.s +; CHECK-NEXT: zip1 z0.s, z0.s, z21.s +; CHECK-NEXT: zip1 z7.s, z22.s, z19.s +; CHECK-NEXT: zip1 z2.s, z2.s, z23.s ; CHECK-NEXT: zip1 z3.d, z4.d, z3.d ; CHECK-NEXT: zip1 z0.d, z0.d, z1.d ; CHECK-NEXT: zip1 z1.d, z6.d, z5.d @@ -1752,67 +1752,67 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; CHECK-NEXT: 
str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: ldp q1, q6, [x0] -; CHECK-NEXT: ldp q0, q2, [x1] -; CHECK-NEXT: mov z3.h, z6.h[6] -; CHECK-NEXT: mov z4.h, z6.h[4] -; CHECK-NEXT: mov z5.h, z6.h[2] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q5, q6, [x1] +; CHECK-NEXT: mov z2.h, z0.h[6] +; CHECK-NEXT: mov z3.h, z0.h[4] +; CHECK-NEXT: mov z4.h, z0.h[2] ; CHECK-NEXT: mov z7.h, z1.h[6] ; CHECK-NEXT: mov z16.h, z1.h[4] ; CHECK-NEXT: mov z17.h, z1.h[2] -; CHECK-NEXT: mov z18.h, z2.h[6] -; CHECK-NEXT: mov z19.h, z2.h[4] -; CHECK-NEXT: mov z20.h, z2.h[2] -; CHECK-NEXT: mov z21.h, z0.h[6] -; CHECK-NEXT: mov z22.h, z0.h[4] -; CHECK-NEXT: zip1 z3.h, z4.h, z3.h -; CHECK-NEXT: zip1 z4.h, z6.h, z5.h -; CHECK-NEXT: zip1 z5.h, z16.h, z7.h +; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: mov z20.h, z6.h[2] +; CHECK-NEXT: mov z21.h, z5.h[6] +; CHECK-NEXT: mov z22.h, z5.h[4] +; CHECK-NEXT: zip1 z2.h, z3.h, z2.h +; CHECK-NEXT: zip1 z3.h, z0.h, z4.h +; CHECK-NEXT: zip1 z4.h, z16.h, z7.h ; CHECK-NEXT: zip1 z7.h, z1.h, z17.h ; CHECK-NEXT: zip1 z16.h, z19.h, z18.h -; CHECK-NEXT: zip1 z18.h, z2.h, z20.h -; CHECK-NEXT: mov z19.h, z0.h[2] -; CHECK-NEXT: zip1 z17.h, z22.h, z21.h -; CHECK-NEXT: mov z20.h, z6.h[7] -; CHECK-NEXT: mov z21.h, z6.h[5] -; CHECK-NEXT: mov z22.h, z6.h[3] -; CHECK-NEXT: mov z6.h, z6.h[1] +; CHECK-NEXT: zip1 z17.h, z6.h, z20.h +; CHECK-NEXT: mov z19.h, z5.h[2] +; CHECK-NEXT: zip1 z18.h, z22.h, z21.h +; CHECK-NEXT: mov z20.h, z0.h[7] +; CHECK-NEXT: mov z21.h, z0.h[5] +; CHECK-NEXT: mov z22.h, z0.h[3] +; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: mov z23.h, z1.h[7] ; CHECK-NEXT: mov z24.h, z1.h[5] ; CHECK-NEXT: mov z25.h, z1.h[3] ; CHECK-NEXT: mov z1.h, z1.h[1] -; CHECK-NEXT: mov z26.h, z2.h[7] -; CHECK-NEXT: mov z27.h, z2.h[5] -; CHECK-NEXT: mov z28.h, z2.h[3] -; CHECK-NEXT: mov z2.h, z2.h[1] -; CHECK-NEXT: mov z29.h, z0.h[7] -; CHECK-NEXT: mov z30.h, 
z0.h[5] -; CHECK-NEXT: mov z31.h, z0.h[3] -; CHECK-NEXT: mov z8.h, z0.h[1] -; CHECK-NEXT: zip1 z0.h, z0.h, z19.h +; CHECK-NEXT: mov z26.h, z6.h[7] +; CHECK-NEXT: mov z27.h, z6.h[5] +; CHECK-NEXT: mov z28.h, z6.h[3] +; CHECK-NEXT: mov z6.h, z6.h[1] +; CHECK-NEXT: mov z29.h, z5.h[7] +; CHECK-NEXT: mov z30.h, z5.h[5] +; CHECK-NEXT: mov z31.h, z5.h[3] +; CHECK-NEXT: mov z8.h, z5.h[1] +; CHECK-NEXT: zip1 z5.h, z5.h, z19.h ; CHECK-NEXT: zip1 z19.h, z21.h, z20.h -; CHECK-NEXT: zip1 z6.h, z6.h, z22.h +; CHECK-NEXT: zip1 z0.h, z0.h, z22.h ; CHECK-NEXT: zip1 z20.h, z24.h, z23.h ; CHECK-NEXT: zip1 z1.h, z1.h, z25.h ; CHECK-NEXT: zip1 z21.h, z27.h, z26.h -; CHECK-NEXT: zip1 z2.h, z2.h, z28.h +; CHECK-NEXT: zip1 z6.h, z6.h, z28.h ; CHECK-NEXT: zip1 z22.h, z30.h, z29.h ; CHECK-NEXT: zip1 z23.h, z8.h, z31.h -; CHECK-NEXT: zip1 z3.s, z4.s, z3.s -; CHECK-NEXT: zip1 z4.s, z7.s, z5.s -; CHECK-NEXT: zip1 z5.s, z18.s, z16.s -; CHECK-NEXT: zip1 z6.s, z6.s, z19.s +; CHECK-NEXT: zip1 z2.s, z3.s, z2.s +; CHECK-NEXT: zip1 z3.s, z7.s, z4.s +; CHECK-NEXT: zip1 z4.s, z17.s, z16.s +; CHECK-NEXT: zip1 z0.s, z0.s, z19.s ; CHECK-NEXT: zip1 z1.s, z1.s, z20.s -; CHECK-NEXT: zip1 z0.s, z0.s, z17.s -; CHECK-NEXT: zip1 z2.s, z2.s, z21.s +; CHECK-NEXT: zip1 z5.s, z5.s, z18.s +; CHECK-NEXT: zip1 z6.s, z6.s, z21.s ; CHECK-NEXT: zip1 z7.s, z23.s, z22.s -; CHECK-NEXT: zip1 z3.d, z4.d, z3.d -; CHECK-NEXT: zip1 z1.d, z1.d, z6.d -; CHECK-NEXT: zip1 z0.d, z0.d, z5.d -; CHECK-NEXT: zip1 z2.d, z7.d, z2.d -; CHECK-NEXT: add z1.h, z3.h, z1.h -; CHECK-NEXT: add z0.h, z0.h, z2.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: zip1 z2.d, z3.d, z2.d +; CHECK-NEXT: zip1 z0.d, z1.d, z0.d +; CHECK-NEXT: zip1 z1.d, z5.d, z4.d +; CHECK-NEXT: zip1 z3.d, z7.d, z6.d +; CHECK-NEXT: add z0.h, z2.h, z0.h +; CHECK-NEXT: add z1.h, z1.h, z3.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; From 33346f87f53f7872e776096da539a8b55ae70f0d Mon Sep 17 00:00:00 2001 From: Benjamin 
Maxwell Date: Fri, 11 Apr 2025 16:56:21 +0000 Subject: [PATCH 11/12] Restore zero store behaviour --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 7 +++++++ .../CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll | 9 ++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2c2f63854dbed..8abb167c9d367 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24025,6 +24025,13 @@ static SDValue performSTORECombine(SDNode *N, EVT VectorVT = Vector.getValueType(); EVT ElemVT = VectorVT.getVectorElementType(); + // Propagate zero constants (applying this fold may miss optimizations). + if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) { + SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT); + DAG.ReplaceAllUsesWith(Value, ZeroElt); + return SDValue(); + } + if (!ValueVT.isInteger()) return SDValue(); if (ValueVT != MemVT && !ST->isTruncatingStore()) diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll index 285221462fd67..91eda8d552397 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) { ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: str wzr, [x0, #88] +; CHECK-NEXT: str xzr, [x0, #80] ; CHECK-NEXT: uaddlv.8h s1, v0 ; CHECK-NEXT: stp q0, q0, [x0, #16] ; CHECK-NEXT: stp q0, q0, [x0, #48] -; CHECK-NEXT: str d0, [x0, #80] ; CHECK-NEXT: mov.s v2[0], v1[0] ; CHECK-NEXT: ucvtf.4s v1, v2 ; CHECK-NEXT: str q1, [x0] @@ -146,13 +146,12 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32: ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: str xzr, [x0, #16] ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: str d0, [x0, #16] ; CHECK-NEXT: fmov x8, d1 ; CHECK-NEXT: ucvtf s1, x8 -; CHECK-NEXT: mov.s v2[0], v1[0] -; CHECK-NEXT: str q2, [x0] +; CHECK-NEXT: mov.s v0[0], v1[0] +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret entry: From ab5439a678d7dac21f8763858759d84d5526bdfd Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 16 Apr 2025 13:35:42 +0000 Subject: [PATCH 12/12] Move check --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8abb167c9d367..3a060a8f663b6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24025,6 +24025,9 @@ static SDValue performSTORECombine(SDNode *N, EVT VectorVT = Vector.getValueType(); EVT ElemVT = VectorVT.getVectorElementType(); + if (!ValueVT.isInteger()) + return SDValue(); + // Propagate zero constants (applying this fold may miss optimizations). if (ISD::isConstantSplatVectorAllZeros(Vector.getNode())) { SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT); @@ -24032,8 +24035,6 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } - if (!ValueVT.isInteger()) - return SDValue(); if (ValueVT != MemVT && !ST->isTruncatingStore()) return SDValue();