From 58842de36b7003a08ca49b7320394c0146cbc8d1 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 4 Mar 2025 16:18:34 +0000 Subject: [PATCH 01/11] [AArch64][SVE] Fold integer lane 0 extract and store to FPR store This helps avoid some pointless fmovs to GPRs, which may be slow in streaming mode. --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 32 ++ .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 328 ++++++++++++++++++ ...plex-deinterleaving-reductions-scalable.ll | 5 +- ...sve-streaming-mode-fixed-length-bitcast.ll | 3 +- ...e-streaming-mode-fixed-length-ext-loads.ll | 32 +- ...-streaming-mode-fixed-length-ld2-alloca.ll | 11 +- ...mode-fixed-length-masked-gather-scatter.ll | 5 +- ...eaming-mode-fixed-length-optimize-ptrue.ll | 3 +- .../sve-streaming-mode-fixed-length-stores.ll | 3 +- 10 files changed, 390 insertions(+), 36 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c61e3a613f6f..92a4890372025 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4579,8 +4579,6 @@ let Predicates = [IsLE] in { (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } -} // AddedComplexity = 10 - // unscaled i64 truncating stores def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; @@ -4589,6 +4587,8 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +} // AddedComplexity = 10 + // Match stores from lane 0 to the appropriate subreg's store. multiclass VecStoreULane0Pat; + // Same as Neon VecStoreLane0Pat but without matching VecListOne128. + multiclass SVEVecStoreLane0Pat { + def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; + } + + let AddedComplexity = 19 in { + // Lane 0 truncating stores + // i32 -> i16 + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + // i64 -> i32 + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + // i64 -> i16 + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + // i16 -> i16 (technically a truncate as the extracted type is i32) + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + + // Lane 0 stores + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + } + // Insert subvectors into FP SVE vectors. foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in foreach idx = [0, 2] in diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll new file mode 100644 index 0000000000000..22b136ac194cc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -0,0 +1,328 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT + +target triple = "aarch64-unknown-linux-gnu" + +; TODO: Improve codegen for non-zero extract indices. + +define void @test_str_lane_s32(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s32: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: str w8, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + store i32 %0, ptr %a, align 4 + ret void +} + +define void @test_str_lane0_s32(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s32: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str s0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store i32 %0, ptr %a, align 4 + ret void +} + +define void @test_str_lane_s64(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s64: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] +; STREAMING-COMPAT-NEXT: fmov x8, d0 +; STREAMING-COMPAT-NEXT: str x8, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 1 + store i64 %0, ptr %a, align 8 + ret void +} + +define void @test_str_lane0_s64(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s64: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str d0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store i64 %0, ptr %a, align 8 + ret void +} + +define void @test_str_lane_f32(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_f32: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: str s0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + store float %0, ptr %a, align 4 + ret void +} + +define void @test_str_lane0_f32(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_f32: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str s0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store float %0, ptr %a, align 4 + ret void +} + +define void @test_str_lane_f64(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_f64: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] +; STREAMING-COMPAT-NEXT: str d0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 1 + store double %0, ptr %a, align 8 + ret void +} + +define void @test_str_lane0_f64(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_f64: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str d0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store double %0, ptr %a, align 8 + ret void +} + +define void @test_str_lane_s8(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w8, v0.b[7] +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: strb w8, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 7 + store i8 %0, ptr %a, align 1 + ret void +} + +define void @test_str_lane_s16(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: strh w8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: strh w8, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + store i16 %0, ptr %a, align 2 + ret void +} + +define void @test_str_lane0_s16(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store i16 %0, ptr %a, align 2 + ret void +} + +define void @test_str_reduction_i32_to_i32(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: str s0, [x0] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i32 + store i32 %trunc, ptr %ptr, align 4 + ret void +} + +define void @test_str_reduction_i32_to_i64(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: str d0, [x0] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + store i64 %reduce, ptr %ptr, align 8 + ret void +} + +define void @test_str_reduction_i32_to_i16(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i16 + store i16 %trunc, ptr %ptr, align 2 + ret void +} + +define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: stur s0, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i32_negative_offset: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i32 + %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + store i32 %trunc, ptr %out_ptr, align 4 + ret void +} + +define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i64_negative_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: stur x8, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i64_negative_offset: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: fmov x8, d0 +; STREAMING-COMPAT-NEXT: stur x8, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + store i64 %reduce, ptr %out_ptr, align 8 + ret void +} + +define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) { +; CHECK-LABEL: test_str_reduction_i32_to_i16_negative_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: stur h0, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i16_negative_offset: +; STREAMING-COMPAT: // %bb.0: +; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + + %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32( %p0, %v) + %trunc = trunc i64 %reduce to i16 + %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + store i16 %trunc, ptr %out_ptr, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 668dc18df6a0b..89f790210e193 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -332,15 +332,14 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: add z2.d, z5.d, z2.d ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d ; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d +; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: faddv d0, p0, z3.d -; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: str s2, [x4] ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1 -; CHECK-NEXT: str w8, [x4] ; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index 6644be11a02ba..ffef6f74f2d36 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -95,8 +95,7 @@ define void @bitcast_v2i16(ptr %a, ptr %b) { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, z0.s[1] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: str s0, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: bitcast_v2i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 7d6336a43a4fd..9e1d342663f0f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: mov z2.d, z1.d[1] ; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: str d1, [x8] +; CHECK-NEXT: str d0, [x8, #64] +; CHECK-NEXT: fmov x10, d2 ; CHECK-NEXT: fmov x11, d0 ; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: asr x10, x9, #63 -; CHECK-NEXT: stp x9, x10, [x8] -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: asr x12, x11, #63 -; CHECK-NEXT: stp x10, x10, [x8, #16] -; CHECK-NEXT: stp x11, x12, [x8, #64] -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: asr x10, x9, #63 -; CHECK-NEXT: stp x12, x12, [x8, #80] -; CHECK-NEXT: stp x10, x10, [x8, #48] -; CHECK-NEXT: asr x12, x11, #63 -; CHECK-NEXT: stp x9, x10, [x8, #32] -; CHECK-NEXT: stp x12, x12, [x8, #112] -; CHECK-NEXT: stp x11, x12, [x8, #96] +; CHECK-NEXT: asr x9, x9, #63 +; CHECK-NEXT: stp x9, x9, [x8, #8] +; CHECK-NEXT: asr x11, x11, #63 +; CHECK-NEXT: stp x9, x10, [x8, #24] +; CHECK-NEXT: asr x9, x10, #63 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: stp x11, x11, [x8, #72] +; CHECK-NEXT: stp x9, x9, [x8, #48] +; CHECK-NEXT: str x9, [x8, #40] +; CHECK-NEXT: asr x9, x10, #63 +; CHECK-NEXT: stp x11, x10, [x8, #88] +; CHECK-NEXT: stp x9, x9, [x8, #112] +; CHECK-NEXT: str x9, [x8, #104] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: load_sext_v4i32i256: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 613543310f2c3..aa1adfd306a4c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -75,8 +75,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strb w8, [x19, #2] -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [x19] +; CHECK-NEXT: str h1, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret @@ -120,14 +119,12 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: bl def ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldr q0, [sp] +; CHECK-NEXT: ldp q0, q2, [sp] ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strb w8, [x19, #8] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: str x8, [x19] +; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll index c8cea6ebabd48..434e24bf48724 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll @@ -121,9 +121,8 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_3: // %cond.store -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: str x9, [x10] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: str d0, [x9] ; CHECK-NEXT: tbz w8, #1, .LBB1_2 ; CHECK-NEXT: .LBB1_4: // %cond.store1 ; CHECK-NEXT: mov z0.d, z0.d[1] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index 431c5a78202e8..74e5fe7352cfd 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -769,8 +769,7 @@ define void @fadd_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ldr s1, [x1] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fadd_v2f16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index b8779991dbb45..17579d79896da 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -93,8 +93,7 @@ define void @store_v2f16(ptr %a) { ; CHECK-LABEL: store_v2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: store_v2f16: From 2b54fe290db9676dd3e96503287da52a3c8d87f5 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 5 Mar 2025 10:17:01 +0000 Subject: [PATCH 02/11] Add missing folds --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 +++-- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 34 ++++++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d61afeccb09d1..49fd743cc65b4 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2023,8 +2023,12 @@ let Predicates = [HasSVE_or_SME] in { defm : SVEVecStoreLane0Pat; // Lane 0 stores - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; + // i32 + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; + // i64 + defm : SVEVecStoreLane0Pat; + defm : SVEVecStoreLane0Pat; } // Insert subvectors into FP SVE vectors. diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index 22b136ac194cc..c2bd513634b44 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -44,6 +44,24 @@ entry: ret void } +define void @test_str_lane0_s32_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s32_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur s0, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8 + store i32 %0, ptr %out_ptr, align 4 + ret void +} + define void @test_str_lane_s64(ptr %a, %b) { ; CHECK-LABEL: test_str_lane_s64: ; CHECK: // %bb.0: // %entry @@ -281,7 +299,7 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, %p0, %v) %trunc = trunc i64 %reduce to i32 - %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + %out_ptr = getelementptr inbounds i32, ptr %ptr, i64 -8 store i32 %trunc, ptr %out_ptr, align 4 ret void } @@ -290,19 +308,17 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, %p0, %v) - %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + %out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8 store i64 %reduce, ptr %out_ptr, align 8 ret void } @@ -311,18 +327,18 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) %trunc = trunc i64 %reduce to i16 - %out_ptr = getelementptr inbounds float, ptr %ptr, i64 -8 + %out_ptr = getelementptr inbounds i16, ptr %ptr, i64 -8 store i16 %trunc, ptr %out_ptr, align 2 ret void } From 8a9bc1cb7cdc2b38edb33c42b991a866bfd515f3 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 5 Mar 2025 13:43:02 +0000 Subject: [PATCH 03/11] Handle a few more cases + more tests --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 84 +++--- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 281 ++++++++++++++++-- ...e-streaming-mode-fixed-length-ext-loads.ll | 28 +- ...mode-fixed-length-masked-gather-scatter.ll | 7 +- 5 files changed, 322 insertions(+), 79 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 92a4890372025..d374c1007dbe7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -134,6 +134,7 @@ def HasRDM : Predicate<"Subtarget->hasRDM()">, def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">; def HasNoFullFP16 : Predicate<"!Subtarget->hasFullFP16()">; +def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 49fd743cc65b4..00d1ea3bf6432 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1827,6 +1827,54 @@ let Predicates = [HasSVE] in { defm : adrXtwShiftPat; } // End HasSVE +multiclass SVEVecStoreLanePat { + let Predicates = [HasSVE_or_SME] in { + // Same as Neon VecStoreLane0Pat but without matching VecListOne128. + def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; + } + + // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP. + let Predicates = [HasSVE_or_SME, HasNoNEON] in { + // Non-zero immediate index: + def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; + } +} + +let AddedComplexity = 19 in { + // Lane 0 truncating stores + // i32 -> i16 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 -> i32 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 -> i16 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i16 -> i16 (technically a truncate as the extracted type is i32) + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + + // Lane 0 stores + // i32 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; +} + let Predicates = [HasSVE_or_SME] in { defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; @@ -1995,42 +2043,6 @@ let Predicates = [HasSVE_or_SME] in { def : Pat<(nxv2bf16 (extract_subvector nxv8bf16:$Zs, (i64 6))), (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; - // Same as Neon VecStoreLane0Pat but without matching VecListOne128. - multiclass SVEVecStoreLane0Pat { - def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; - } - - let AddedComplexity = 19 in { - // Lane 0 truncating stores - // i32 -> i16 - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - // i64 -> i32 - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - // i64 -> i16 - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - // i16 -> i16 (technically a truncate as the extracted type is i32) - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - - // Lane 0 stores - // i32 - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - // i64 - defm : SVEVecStoreLane0Pat; - defm : SVEVecStoreLane0Pat; - } - // Insert subvectors into FP SVE vectors. foreach VT = [nxv4f16, nxv4f32, nxv4bf16] in foreach idx = [0, 2] in diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index c2bd513634b44..7c460f45f7972 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -5,8 +5,6 @@ target triple = "aarch64-unknown-linux-gnu" -; TODO: Improve codegen for non-zero extract indices. - define void @test_str_lane_s32(ptr %a, %b) { ; CHECK-LABEL: test_str_lane_s32: ; CHECK: // %bb.0: // %entry @@ -17,8 +15,7 @@ define void @test_str_lane_s32(ptr %a, %b) { ; STREAMING-COMPAT-LABEL: test_str_lane_s32: ; STREAMING-COMPAT: // %bb.0: // %entry ; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: fmov w8, s0 -; STREAMING-COMPAT-NEXT: str w8, [x0] +; STREAMING-COMPAT-NEXT: str s0, [x0] ; STREAMING-COMPAT-NEXT: ret entry: @@ -44,24 +41,6 @@ entry: ret void } -define void @test_str_lane0_s32_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane0_s32_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stur s0, [x0, #-32] -; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] -; STREAMING-COMPAT-NEXT: ret - -entry: - %0 = extractelement %b, i32 0 - %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8 - store i32 %0, ptr %out_ptr, align 4 - ret void -} - define void @test_str_lane_s64(ptr %a, %b) { ; CHECK-LABEL: test_str_lane_s64: ; CHECK: // %bb.0: // %entry @@ -72,8 +51,7 @@ define void @test_str_lane_s64(ptr %a, %b) { ; STREAMING-COMPAT-LABEL: test_str_lane_s64: ; STREAMING-COMPAT: // %bb.0: // %entry ; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: fmov x8, d0 -; STREAMING-COMPAT-NEXT: str x8, [x0] +; STREAMING-COMPAT-NEXT: str d0, [x0] ; STREAMING-COMPAT-NEXT: ret entry: @@ -191,6 +169,25 @@ entry: ret void } +define void @test_str_lane0_s8(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strb w8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s8: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: strb w8, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + store i8 %0, ptr %a, align 1 + ret void +} + define void @test_str_lane_s16(ptr %a, %b) { ; CHECK-LABEL: test_str_lane_s16: ; CHECK: // %bb.0: // %entry @@ -201,8 +198,7 @@ define void @test_str_lane_s16(ptr %a, %b) { ; STREAMING-COMPAT-LABEL: test_str_lane_s16: ; STREAMING-COMPAT: // %bb.0: // %entry ; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] -; STREAMING-COMPAT-NEXT: fmov w8, s0 -; STREAMING-COMPAT-NEXT: strh w8, [x0] +; STREAMING-COMPAT-NEXT: str h0, [x0] ; STREAMING-COMPAT-NEXT: ret entry: @@ -342,3 +338,236 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %b) { +; CHECK-LABEL: test_str_lane_s32_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: stur w8, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8 + store i32 %0, ptr %out_ptr, align 4 + ret void +} + +define void @test_str_lane0_s32_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s32_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur s0, [x0, #-32] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %out_ptr = getelementptr inbounds i32, ptr %a, i64 -8 + store i32 %0, ptr %out_ptr, align 4 + ret void +} + +define void @test_str_lane_s64_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s64_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: stur x8, [x0, #-64] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] +; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 1 + %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8 + store i64 %0, ptr %out_ptr, align 8 + ret void +} + +define void @test_str_lane0_s64_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s64_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur d0, [x0, #-64] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %out_ptr = getelementptr inbounds i64, ptr %a, i64 -8 + store i64 %0, ptr %out_ptr, align 8 + ret void +} + +define void @test_str_lane_s8_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w8, v0.b[7] +; CHECK-NEXT: sturb w8, [x0, #-8] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 7 + %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 + store i8 %0, ptr %out_ptr, align 1 + ret void +} + +define void @test_str_lane0_s8_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s8_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sturb w8, [x0, #-8] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: fmov w8, s0 +; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 + store i8 %0, ptr %out_ptr, align 1 + ret void +} + +define void @test_str_lane_s16_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: sturh w8, [x0, #-16] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8 + store i16 %0, ptr %out_ptr, align 2 + ret void +} + +define void @test_str_lane0_s16_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_lane0_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8 + store i16 %0, ptr %out_ptr, align 2 + ret void +} + +define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s32_to_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: strh w8, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i32 %0 to i16 + store i16 %trunc, ptr %a, align 2 + ret void +} + +define void @test_str_trunc_lane0_s32_to_s16(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: str h0, [x0] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %trunc = trunc i32 %0 to i16 + store i16 %trunc, ptr %a, align 2 + ret void +} + +define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: sturh w8, [x0, #-16] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 3 + %trunc = trunc i32 %0 to i16 + %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8 + store i16 %trunc, ptr %out_ptr, align 2 + ret void +} + +define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, %b) { +; CHECK-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret +; +; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16_negative_offset: +; STREAMING-COMPAT: // %bb.0: // %entry +; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] +; STREAMING-COMPAT-NEXT: ret + +entry: + %0 = extractelement %b, i32 0 + %trunc = trunc i32 %0 to i16 + %out_ptr = getelementptr inbounds i16, ptr %a, i64 -8 + store i16 %trunc, ptr %out_ptr, align 2 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 9e1d342663f0f..2c891251befc7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -268,24 +268,26 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: mov z2.d, z1.d[1] ; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: mov z2.d, z1.d[1] ; CHECK-NEXT: str d1, [x8] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: asr x9, x9, #63 +; CHECK-NEXT: mov z1.d, z0.d[1] ; CHECK-NEXT: str d0, [x8, #64] +; CHECK-NEXT: stp x9, x9, [x8, #16] +; CHECK-NEXT: str x9, [x8, #8] +; CHECK-NEXT: asr x9, x10, #63 ; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: str d2, [x8, #32] +; CHECK-NEXT: stp x9, x9, [x8, #80] +; CHECK-NEXT: str x9, [x8, #72] +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: asr x10, x10, #63 +; CHECK-NEXT: str d1, [x8, #96] +; CHECK-NEXT: stp x10, x10, [x8, #48] ; CHECK-NEXT: asr x9, x9, #63 -; CHECK-NEXT: stp x9, x9, [x8, #8] -; CHECK-NEXT: asr x11, x11, #63 -; CHECK-NEXT: stp x9, x10, [x8, #24] -; CHECK-NEXT: asr x9, x10, #63 -; CHECK-NEXT: fmov x10, d0 -; CHECK-NEXT: stp x11, x11, [x8, #72] -; CHECK-NEXT: stp x9, x9, [x8, #48] -; CHECK-NEXT: str x9, [x8, #40] -; CHECK-NEXT: asr x9, x10, #63 -; CHECK-NEXT: stp x11, x10, [x8, #88] +; CHECK-NEXT: str x10, [x8, #40] ; CHECK-NEXT: stp x9, x9, [x8, #112] ; CHECK-NEXT: str x9, [x8, #104] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll index 434e24bf48724..d9f8482a3c503 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll @@ -125,11 +125,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: str d0, [x9] ; CHECK-NEXT: tbz w8, #1, .LBB1_2 ; CHECK-NEXT: .LBB1_4: // %cond.store1 -; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: mov z1.d, z1.d[1] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; From 446ae821958a9f6e1d5c17ef6f6f198bbfee736e Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 6 Mar 2025 12:10:42 +0000 Subject: [PATCH 04/11] Avoid duplicate test checks --- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 216 +++++------------- 1 file changed, 53 insertions(+), 163 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index 7c460f45f7972..2278bc82fcf6e 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK -; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=STREAMING-COMPAT -; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING +; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT +; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT target triple = "aarch64-unknown-linux-gnu" define void @test_str_lane_s32(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: str w8, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s32: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] +; CHECK-NONSTREAMING-NEXT: str w8, [x0] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s32: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -29,11 +29,6 @@ define void @test_str_lane0_s32(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s32: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str s0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -42,11 +37,11 @@ entry: } define void @test_str_lane_s64(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: str x8, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s64: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1] +; CHECK-NONSTREAMING-NEXT: str x8, [x0] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s64: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -65,11 +60,6 @@ define void @test_str_lane0_s64(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s64: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str d0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -83,12 +73,6 @@ define void @test_str_lane_f32(ptr %a, %b) { ; CHECK-NEXT: mov z0.s, z0.s[3] ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_f32: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str s0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -101,11 +85,6 @@ define void @test_str_lane0_f32(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_f32: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str s0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -119,12 +98,6 @@ define void @test_str_lane_f64(ptr %a, %b) { ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_f64: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: str d0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -137,11 +110,6 @@ define void @test_str_lane0_f64(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_f64: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str d0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -150,11 +118,11 @@ entry: } define void @test_str_lane_s8(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umov w8, v0.b[7] -; CHECK-NEXT: strb w8, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] +; CHECK-NONSTREAMING-NEXT: strb w8, [x0] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s8: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -175,12 +143,6 @@ define void @test_str_lane0_s8(ptr %a, %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s8: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: fmov w8, s0 -; STREAMING-COMPAT-NEXT: strb w8, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -189,11 +151,11 @@ entry: } define void @test_str_lane_s16(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: strh w8, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s16: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3] +; CHECK-NONSTREAMING-NEXT: strh w8, [x0] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s16: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -212,11 +174,6 @@ define void @test_str_lane0_s16(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s16: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -230,12 +187,6 @@ define void @test_str_reduction_i32_to_i32(ptr %ptr, %p0, %p0, %v) %trunc = trunc i64 %reduce to i32 @@ -249,12 +200,6 @@ define void @test_str_reduction_i32_to_i64(ptr %ptr, %p0, %p0, %v) store i64 %reduce, ptr %ptr, align 8 @@ -267,12 +212,6 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, %p0, %p0, %v) %trunc = trunc i64 %reduce to i16 @@ -286,12 +225,6 @@ define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, %p0, %v) %trunc = trunc i64 %reduce to i32 @@ -306,12 +239,6 @@ define void @test_str_reduction_i32_to_i64_negative_offset(ptr %ptr, %p0, %v) %out_ptr = getelementptr inbounds i64, ptr %ptr, i64 -8 @@ -325,12 +252,6 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %p0, %v) %trunc = trunc i64 %reduce to i16 @@ -340,11 +261,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %b) { -; CHECK-LABEL: test_str_lane_s32_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: stur w8, [x0, #-32] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] +; CHECK-NONSTREAMING-NEXT: stur w8, [x0, #-32] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -364,11 +285,6 @@ define void @test_str_lane0_s32_negative_offset(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stur s0, [x0, #-32] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s32_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -378,11 +294,11 @@ entry: } define void @test_str_lane_s64_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s64_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: stur x8, [x0, #-64] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1] +; CHECK-NONSTREAMING-NEXT: stur x8, [x0, #-64] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -402,11 +318,6 @@ define void @test_str_lane0_s64_negative_offset(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stur d0, [x0, #-64] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s64_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -416,11 +327,11 @@ entry: } define void @test_str_lane_s8_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s8_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umov w8, v0.b[7] -; CHECK-NEXT: sturb w8, [x0, #-8] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] +; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -442,12 +353,6 @@ define void @test_str_lane0_s8_negative_offset(ptr %a, %b) { ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sturb w8, [x0, #-8] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: fmov w8, s0 -; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -457,11 +362,11 @@ entry: } define void @test_str_lane_s16_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_lane_s16_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: sturh w8, [x0, #-16] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3] +; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -481,11 +386,6 @@ define void @test_str_lane0_s16_negative_offset(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stur h0, [x0, #-16] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane0_s16_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -495,11 +395,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: strh w8, [x0] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] +; CHECK-NONSTREAMING-NEXT: strh w8, [x0] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -519,11 +419,6 @@ define void @test_str_trunc_lane0_s32_to_s16(ptr %a, %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str h0, [x0] ; CHECK-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s16: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret entry: %0 = extractelement %b, i32 0 @@ -533,11 +428,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { -; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: sturh w8, [x0, #-16] -; CHECK-NEXT: ret +; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; CHECK-NONSTREAMING: // %bb.0: // %entry +; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] +; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16] +; CHECK-NONSTREAMING-NEXT: ret ; ; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: ; STREAMING-COMPAT: // %bb.0: // %entry @@ -558,11 +453,6 @@ define void @test_str_trunc_lane0_s32_to_s16_negative_offset(ptr %a, %b, i32 0 From 40b1a948759590530d6798896d6355136d146f18 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 6 Mar 2025 14:45:17 +0000 Subject: [PATCH 05/11] Fixups --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 73 +++++----- .../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 128 ++++++------------ 2 files changed, 75 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 00d1ea3bf6432..723e01853baed 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1833,49 +1833,46 @@ multiclass SVEVecStoreLanePat { - let Predicates = [HasSVE_or_SME] in { - // Same as Neon VecStoreLane0Pat but without matching VecListOne128. - def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; - } + // Same as Neon VecStoreLane0Pat but without matching VecListOne128. + def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP. - let Predicates = [HasSVE_or_SME, HasNoNEON] in { - // Non-zero immediate index: - def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; - } -} - -let AddedComplexity = 19 in { - // Lane 0 truncating stores - // i32 -> i16 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 -> i32 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 -> i16 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i16 -> i16 (technically a truncate as the extracted type is i32) - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - - // Lane 0 stores - // i32 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; + // Non-zero immediate index: + def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), + (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), + (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), + GPR64sp:$Rn, IndexType:$offset)>; } let Predicates = [HasSVE_or_SME] in { + + let AddedComplexity = 19 in { + // Lane 0 truncating stores + // i32 -> i16 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 -> i32 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 -> i16 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i16 -> i16 (technically a truncate as the extracted type is i32) + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + + // Lane 0 stores + // i32 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + // i64 + defm : SVEVecStoreLanePat; + defm : SVEVecStoreLanePat; + } + defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll index 2278bc82fcf6e..d39c9bf760621 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll @@ -6,17 +6,11 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_str_lane_s32(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s32: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] -; CHECK-NONSTREAMING-NEXT: str w8, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s32: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str s0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -37,17 +31,11 @@ entry: } define void @test_str_lane_s64(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s64: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1] -; CHECK-NONSTREAMING-NEXT: str x8, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s64: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: str d0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -151,17 +139,11 @@ entry: } define void @test_str_lane_s16(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s16: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3] -; CHECK-NONSTREAMING-NEXT: strh w8, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s16: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -261,17 +243,11 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s32_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] -; CHECK-NONSTREAMING-NEXT: stur w8, [x0, #-32] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s32_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: stur s0, [x0, #-32] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s32_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur s0, [x0, #-32] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -294,17 +270,11 @@ entry: } define void @test_str_lane_s64_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s64_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov x8, v0.d[1] -; CHECK-NONSTREAMING-NEXT: stur x8, [x0, #-64] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s64_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.d, z0.d[1] -; STREAMING-COMPAT-NEXT: stur d0, [x0, #-64] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s64_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: stur d0, [x0, #-64] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 1 @@ -362,17 +332,11 @@ entry: } define void @test_str_lane_s16_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_lane_s16_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: umov w8, v0.h[3] -; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_lane_s16_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.h, z0.h[3] -; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_lane_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, z0.h[3] +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -395,17 +359,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] -; CHECK-NONSTREAMING-NEXT: strh w8, [x0] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: str h0, [x0] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane_s32_to_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 @@ -428,17 +386,11 @@ entry: } define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, %b) { -; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; CHECK-NONSTREAMING: // %bb.0: // %entry -; CHECK-NONSTREAMING-NEXT: mov w8, v0.s[3] -; CHECK-NONSTREAMING-NEXT: sturh w8, [x0, #-16] -; CHECK-NONSTREAMING-NEXT: ret -; -; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: -; STREAMING-COMPAT: // %bb.0: // %entry -; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3] -; STREAMING-COMPAT-NEXT: stur h0, [x0, #-16] -; STREAMING-COMPAT-NEXT: ret +; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, z0.s[3] +; CHECK-NEXT: stur h0, [x0, #-16] +; CHECK-NEXT: ret entry: %0 = extractelement %b, i32 3 From 98865833804547ced178f53ddc098536338cbd70 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 6 Mar 2025 14:50:59 +0000 Subject: [PATCH 06/11] Fixups --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d374c1007dbe7..92a4890372025 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -134,7 +134,6 @@ def HasRDM : Predicate<"Subtarget->hasRDM()">, def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">; def HasNoFullFP16 : Predicate<"!Subtarget->hasFullFP16()">; -def HasNoNEON : Predicate<"!Subtarget->isNeonAvailable()">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, From 17f34a65b24d79cce7014f8133f243d4545d5fe2 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 6 Mar 2025 19:36:18 +0000 Subject: [PATCH 07/11] Rewrite as DAG combine --- .../Target/AArch64/AArch64ISelLowering.cpp | 56 +++++++++++++++++-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 44 --------------- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a9b4965e32b4c..61347009eb2ad 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23938,6 +23938,20 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, return Chain; } +static int getFPSubregForVT(EVT VT) { + assert(VT.isSimple() && "Expected simple VT"); + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f16: + return AArch64::hsub; + case MVT::f32: + return AArch64::ssub; + case MVT::f64: + return AArch64::dsub; + default: + llvm_unreachable("Unexpected VT!"); + } +} + static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -23998,17 +24012,49 @@ static SDValue performSTORECombine(SDNode *N, if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST)) return Store; - if (ST->isTruncatingStore()) { - EVT StoreVT = ST->getMemoryVT(); - if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT)) - return SDValue(); + if (ST->isTruncatingStore() && + isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) { if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) { return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(), - StoreVT, ST->getMemOperand()); + MemVT, ST->getMemOperand()); } } + // This is an integer vector_extract_elt followed by a (possibly truncating) + // store. We may be able to replace this with a store of an FP subregister. + if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() && + Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vector = Value.getOperand(0); + SDValue ExtIdx = Value.getOperand(1); + EVT VectorVT = Vector.getValueType(); + EVT ElemVT = VectorVT.getVectorElementType(); + // TODO: Consider allowing Neon (a lot of churn, not necessarily better). + if (!VectorVT.isScalableVector()) + return SDValue(); + if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8) + return SDValue(); + if (ValueVT != MemVT && !ST->isTruncatingStore()) + return SDValue(); + + EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits()); + EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector); + SDValue Ext = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx); + + EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits()); + if (ST->isTruncatingStore() && FPMemVT != FPElemVT) { + SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL, + FPMemVT, Ext); + return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(), + ST->getMemOperand()); + } + + return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(), + ST->getMemOperand()); + } + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 92a4890372025..6c61e3a613f6f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4579,6 +4579,8 @@ let Predicates = [IsLE] in { (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } +} // AddedComplexity = 10 + // unscaled i64 truncating stores def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; @@ -4587,8 +4589,6 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -} // AddedComplexity = 10 - // Match stores from lane 0 to the appropriate subreg's store. multiclass VecStoreULane0Pat; } // End HasSVE -multiclass SVEVecStoreLanePat { - // Same as Neon VecStoreLane0Pat but without matching VecListOne128. - def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; - - // Only used for streaming[-compatible] SVE -- when NEON is available we avoid a DUP. - // Non-zero immediate index: - def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)), - (UIAddrMode GPR64sp:$Rn, IndexType:$offset)), - (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)), - GPR64sp:$Rn, IndexType:$offset)>; -} - let Predicates = [HasSVE_or_SME] in { - let AddedComplexity = 19 in { - // Lane 0 truncating stores - // i32 -> i16 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 -> i32 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 -> i16 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i16 -> i16 (technically a truncate as the extracted type is i32) - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - - // Lane 0 stores - // i32 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - // i64 - defm : SVEVecStoreLanePat; - defm : SVEVecStoreLanePat; - } - defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; From 473144fe323e32b1bb8237076edd0a0bcca8163b Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 6 Mar 2025 19:41:02 +0000 Subject: [PATCH 08/11] Rm whitespace --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 9ea488e6145ad..3ee71c14c6bd4 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1828,7 +1828,6 @@ let Predicates = [HasSVE] in { } // End HasSVE let Predicates = [HasSVE_or_SME] in { - defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; From 2e7219e25ec9c06b7affc169f2d4fc48eabd6315 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 17 Mar 2025 10:48:43 +0000 Subject: [PATCH 09/11] Fixups --- .../Target/AArch64/AArch64ISelLowering.cpp | 21 +++++++++--- .../aarch64-neon-vector-insert-uaddlv.ll | 5 ++- llvm/test/CodeGen/AArch64/add.ll | 7 ++-- llvm/test/CodeGen/AArch64/andorxor.ll | 21 +++++------- llvm/test/CodeGen/AArch64/bitcast.ll | 14 ++++---- llvm/test/CodeGen/AArch64/mul.ll | 7 ++-- llvm/test/CodeGen/AArch64/neon-rshrn.ll | 7 ++-- llvm/test/CodeGen/AArch64/neon-truncstore.ll | 7 ++-- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 7 ++-- llvm/test/CodeGen/AArch64/shufflevector.ll | 14 ++++---- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 7 ++-- llvm/test/CodeGen/AArch64/store.ll | 23 ++++--------- llvm/test/CodeGen/AArch64/sub.ll | 7 ++-- ...e-streaming-mode-fixed-length-ext-loads.ll | 34 ++++++++----------- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 7 ++-- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 7 ++-- 16 files changed, 89 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 61347009eb2ad..918a65b8132be 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23938,7 +23938,7 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, return Chain; } -static int getFPSubregForVT(EVT VT) { +static unsigned getFPSubregForVT(EVT VT) { assert(VT.isSimple() && "Expected simple VT"); switch (VT.getSimpleVT().SimpleTy) { case MVT::f16: @@ -24025,18 +24025,31 @@ static SDValue performSTORECombine(SDNode *N, // store. We may be able to replace this with a store of an FP subregister. if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() && Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue Vector = Value.getOperand(0); SDValue ExtIdx = Value.getOperand(1); EVT VectorVT = Vector.getValueType(); EVT ElemVT = VectorVT.getVectorElementType(); - // TODO: Consider allowing Neon (a lot of churn, not necessarily better). - if (!VectorVT.isScalableVector()) - return SDValue(); if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8) return SDValue(); if (ValueVT != MemVT && !ST->isTruncatingStore()) return SDValue(); + // Heuristic: If there are other users of integer scalars extracted from + // this vector that won't fold into the store -- abandon folding. This may + // extend the vector lifetime and disrupt paired stores. + for (auto Use = Vector->use_begin(), End = Vector->use_end(); Use != End; + ++Use) { + if (Use->getResNo() != Vector.getResNo()) + continue; + SDNode *User = Use->getUser(); + if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (!User->hasOneUse() || + (*User->user_begin())->getOpcode() != ISD::STORE) + return SDValue(); + } + } + EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits()); EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT); SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector); diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll index 1b7bc128d6332..b357a24f892ff 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) { ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: movi.2d v2, #0000000000000000 ; CHECK-NEXT: str wzr, [x0, #88] +; CHECK-NEXT: str xzr, [x0, #80] ; CHECK-NEXT: uaddlv.8h s1, v0 ; CHECK-NEXT: stp q0, q0, [x0, #16] ; CHECK-NEXT: stp q0, q0, [x0, #48] -; CHECK-NEXT: str d0, [x0, #80] ; CHECK-NEXT: mov.s v2[0], v1[0] ; CHECK-NEXT: ucvtf.4s v1, v2 ; CHECK-NEXT: str q1, [x0] @@ -146,11 +146,10 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 +; CHECK-NEXT: str xzr, [x0, #16] ; CHECK-NEXT: uaddlv.4s d1, v0 ; CHECK-NEXT: mov.d v0[0], v1[0] -; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: str d1, [x0, #16] ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index e3072dc41d933..fc0ba336b21cc 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -232,10 +232,9 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 5c7429aebb31e..24f2549cce785 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -696,10 +696,9 @@ define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: and_v2i16: @@ -733,10 +732,9 @@ define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: or_v2i16: @@ -770,10 +768,9 @@ define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: xor_v2i16: diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 39f2572d9fd35..d9199ce2c79de 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -102,10 +102,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret @@ -401,10 +400,9 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr s0, [sp, #12] ; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9ca975d9e742e..500379d1cfdec 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -244,10 +244,9 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll index e648b10ea357b..8fabd7a618f68 100644 --- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll +++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll @@ -868,10 +868,9 @@ define void @rshrn_v2i32_4(<2 x i32> %a, ptr %p) { ; CHECK-NEXT: movi v1.2s, #8 ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #4 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [x0] -; CHECK-NEXT: strh w8, [x0, #2] +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret entry: %b = add <2 x i32> %a, diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index 5d78ad24eb333..3d3362d314a99 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -42,10 +42,9 @@ define void @v2i32_v2i16(<2 x i32> %a, ptr %result) { ; CHECK-LABEL: v2i32_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [x0] -; CHECK-NEXT: strh w8, [x0, #2] +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret %b = trunc <2 x i32> %a to <2 x i16> store <2 x i16> %b, ptr %result diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 531562d3aa678..4d76994be204f 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -256,10 +256,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x2] -; CHECK-SD-NEXT: strh w8, [x2, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x2] +; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index 0f5b240e387ed..4c8f0c9c446f5 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -288,10 +288,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret @@ -499,10 +498,9 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: dup v1.2s, v0.s[0] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [sp, #12] -; CHECK-SD-NEXT: mov w8, v1.s[1] -; CHECK-SD-NEXT: strh w8, [sp, #14] +; CHECK-SD-NEXT: str h0, [sp, #12] +; CHECK-SD-NEXT: mov s1, v1.s[1] +; CHECK-SD-NEXT: str h1, [sp, #14] ; CHECK-SD-NEXT: ldr w0, [sp, #12] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index be4a5843e8215..ae2a16929e254 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -257,10 +257,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x2] -; CHECK-SD-NEXT: strh w8, [x2, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x2] +; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 86d74b69f4958..37a6ad08d4cb3 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -147,22 +147,13 @@ define void @store_v32i8(<32 x i8> %a, ptr %ptr){ } define void @store_v2i16(<2 x i16> %a, ptr %ptr){ -; CHECK-SD-LABEL: store_v2i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: store_v2i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: str h0, [x0] -; CHECK-GI-NEXT: str h1, [x0, #2] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: store_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] +; CHECK-NEXT: ret store <2 x i16> %a, ptr %ptr ret void } diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 8f35a69f52b85..8183a82f21cb5 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -232,10 +232,9 @@ define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] ; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 2c891251befc7..7d6336a43a4fd 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -269,27 +269,23 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: str d1, [x8] -; CHECK-NEXT: fmov x10, d0 -; CHECK-NEXT: asr x9, x9, #63 -; CHECK-NEXT: mov z1.d, z0.d[1] -; CHECK-NEXT: str d0, [x8, #64] -; CHECK-NEXT: stp x9, x9, [x8, #16] -; CHECK-NEXT: str x9, [x8, #8] -; CHECK-NEXT: asr x9, x10, #63 -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: str d2, [x8, #32] -; CHECK-NEXT: stp x9, x9, [x8, #80] -; CHECK-NEXT: str x9, [x8, #72] +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: asr x10, x9, #63 +; CHECK-NEXT: stp x9, x10, [x8] ; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: asr x10, x10, #63 -; CHECK-NEXT: str d1, [x8, #96] +; CHECK-NEXT: asr x12, x11, #63 +; CHECK-NEXT: stp x10, x10, [x8, #16] +; CHECK-NEXT: stp x11, x12, [x8, #64] +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: asr x10, x9, #63 +; CHECK-NEXT: stp x12, x12, [x8, #80] ; CHECK-NEXT: stp x10, x10, [x8, #48] -; CHECK-NEXT: asr x9, x9, #63 -; CHECK-NEXT: str x10, [x8, #40] -; CHECK-NEXT: stp x9, x9, [x8, #112] -; CHECK-NEXT: str x9, [x8, #104] +; CHECK-NEXT: asr x12, x11, #63 +; CHECK-NEXT: stp x9, x10, [x8, #32] +; CHECK-NEXT: stp x12, x12, [x8, #112] +; CHECK-NEXT: stp x11, x12, [x8, #96] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: load_sext_v4i32i256: diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 924bd3981779e..d0173307bd830 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -255,10 +255,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x2] -; CHECK-SD-NEXT: strh w8, [x2, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x2] +; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a623eb554cac7..dc3ebfb0682ca 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -252,10 +252,9 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-NEXT: mov v0.s[1], w10 ; CHECK-SD-NEXT: mov v1.s[1], w11 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x2] -; CHECK-SD-NEXT: strh w8, [x2, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x2] +; CHECK-SD-NEXT: str h1, [x2, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: From f64904cb78ec2f9c3ef5a15c2ff46bae490f8394 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 17 Mar 2025 17:47:20 +0000 Subject: [PATCH 10/11] Fixups --- .../Target/AArch64/AArch64ISelLowering.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 918a65b8132be..879b83f94b79a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24036,18 +24036,16 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); // Heuristic: If there are other users of integer scalars extracted from - // this vector that won't fold into the store -- abandon folding. This may - // extend the vector lifetime and disrupt paired stores. - for (auto Use = Vector->use_begin(), End = Vector->use_end(); Use != End; - ++Use) { - if (Use->getResNo() != Vector.getResNo()) + // this vector that won't fold into the store -- abandon folding. Applying + // this fold may extend the vector lifetime and disrupt paired stores. + for (const auto &Use : Vector->uses()) { + if (Use.getResNo() != Vector.getResNo()) continue; - SDNode *User = Use->getUser(); - if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - if (!User->hasOneUse() || - (*User->user_begin())->getOpcode() != ISD::STORE) - return SDValue(); - } + const SDNode *User = Use.getUser(); + if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + (!User->hasOneUse() || + (*User->user_begin())->getOpcode() != ISD::STORE)) + return SDValue(); } EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits()); From a6897836e206de07584447ca184cd32c3c41ece9 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 18 Mar 2025 09:53:01 +0000 Subject: [PATCH 11/11] Rebase: Update tests --- llvm/test/CodeGen/AArch64/ctlz.ll | 7 +++---- llvm/test/CodeGen/AArch64/ctpop.ll | 7 +++---- llvm/test/CodeGen/AArch64/cttz.ll | 7 +++---- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 437e3d5ff75c6..7b8f6cf24f278 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -302,10 +302,9 @@ define void @v2i16(ptr %p1) { ; CHECK-SD-NEXT: mov v1.s[1], w9 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: mov w8, v0.s[1] -; CHECK-SD-NEXT: fmov w9, s0 -; CHECK-SD-NEXT: strh w9, [x0] -; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: mov s1, v0.s[1] +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: str h1, [x0, #2] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: v2i16: diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index 785a447123b5e..2299b5c5a5af9 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -122,10 +122,9 @@ define void @v2i16(ptr %p1) { ; CHECK-NEXT: cnt v0.8b, v0.8b ; CHECK-NEXT: uaddlp v0.4h, v0.8b ; CHECK-NEXT: uaddlp v0.2s, v0.4h -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [x0] -; CHECK-NEXT: strh w8, [x0, #2] +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret entry: %d = load <2 x i16>, ptr %p1 diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index a254df229c127..9bc0970deeeda 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -164,10 +164,9 @@ define void @v2i16(ptr %p1) { ; CHECK-NEXT: movi v1.2s, #32 ; CHECK-NEXT: clz v0.2s, v0.2s ; CHECK-NEXT: sub v0.2s, v1.2s, v0.2s -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w9, [x0] -; CHECK-NEXT: strh w8, [x0, #2] +; CHECK-NEXT: mov s1, v0.s[1] +; CHECK-NEXT: str h0, [x0] +; CHECK-NEXT: str h1, [x0, #2] ; CHECK-NEXT: ret entry: %d = load <2 x i16>, ptr %p1