From fe7e653e6306671a8d55bcee4be38adabf217ede Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 16 Oct 2024 17:26:48 +0100
Subject: [PATCH 1/3] Precommit tests

---
 llvm/test/Analysis/CostModel/RISCV/splice.ll |   26 +-
 llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 2769 ++++++++++++++++--
 2 files changed, 2614 insertions(+), 181 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index 8d7d1576a532d..ddfaa8c13d425 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE

@@ -34,6 +34,13 @@ define void @vector_splice() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -86,6 +93,13 @@ define void @vector_splice() {
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
@@ -141,6 +155,14 @@ define void @vector_splice() {
   %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
   %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)

+  %splice.nxv1bf16 = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv32bf16 = call <vscale x 32 x bfloat> @llvm.vector.splice.nxv32bf16(<vscale x 32 x bfloat> zeroinitializer, <vscale x 32 x bfloat> zeroinitializer, i32 -1)
+  %splice.nxv64bf16 = call <vscale x 64 x bfloat> @llvm.vector.splice.nxv64bf16(<vscale x 64 x bfloat> zeroinitializer, <vscale x 64 x bfloat> zeroinitializer, i32 -1)
+
   %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
   %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
   %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
diff --git
a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index 8cb6fed2f588a..3f84f4549ce81 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN64 +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN32 +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH32 +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH64 ; Tests assume VLEN=128 or vscale_range_min=2. @@ -1533,6 +1535,1220 @@ define @splice_nxv8i64_offset_max( %a, %res } +declare @llvm.vector.splice.nxv1bf16(, , i32) + +define @splice_nxv1bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv1bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv1bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a2, 4 +; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: bltu a1, a2, .LBB104_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 4 +; CHECK-NEXT: .LBB104_2: +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -2) + ret %res +} + +define @splice_nxv1bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 
0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: li a3, 1 +; CHECK-NEXT: vse16.v v9, (a2) +; CHECK-NEXT: bltu a1, a3, .LBB105_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: .LBB105_2: +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 1) + ret %res +} + +declare @llvm.vector.splice.nxv2bf16(, , i32) + +define @splice_nxv2bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv2bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv2bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a2, 8 +; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: bltu a1, a2, .LBB108_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 8 +; CHECK-NEXT: .LBB108_2: +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -4) + ret %res +} + +define @splice_nxv2bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: li a3, 3 +; CHECK-NEXT: 
vse16.v v9, (a2) +; CHECK-NEXT: bltu a1, a3, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 3 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 3) + ret %res +} + +declare @llvm.vector.splice.nxv4bf16(, , i32) + +define @splice_nxv4bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv4bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vs1r.v v9, (a0) +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv4bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: vs1r.v v9, (a0) +; CHECK-NEXT: bltu a1, a2, .LBB112_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: .LBB112_2: +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -8) + ret %res +} + +define @splice_nxv4bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: li a3, 7 +; CHECK-NEXT: vs1r.v v9, (a2) +; CHECK-NEXT: bltu a1, a3, .LBB113_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 7 +; CHECK-NEXT: .LBB113_2: +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 7) + 
ret %res +} + +declare @llvm.vector.splice.nxv8bf16(, , i32) + +define @splice_nxv8bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv8bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vs2r.v v10, (a0) +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv8bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vs2r.v v10, (a0) +; CHECK-NEXT: bltu a1, a2, .LBB116_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: .LBB116_2: +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -16) + ret %res +} + +define @splice_nxv8bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: li a3, 15 +; CHECK-NEXT: vs2r.v v10, (a2) +; CHECK-NEXT: bltu a1, a3, .LBB117_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 15 +; CHECK-NEXT: .LBB117_2: +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 15) + ret %res +} + +declare @llvm.vector.splice.nxv16bf16(, , i32) + +define @splice_nxv16bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv16bf16_offset_negone( %a, %b) 
#0 { +; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs4r.v v12, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs4r.v v12, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv16bf16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -48 +; ZVFH32-NEXT: .cfi_def_cfa_offset 48 +; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 48 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 3 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -32 +; ZVFH32-NEXT: addi a0, sp, 32 +; ZVFH32-NEXT: vs4r.v v8, (a0) +; ZVFH32-NEXT: csrr a1, vlenb +; ZVFH32-NEXT: slli a1, a1, 2 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: vs4r.v v12, (a0) +; ZVFH32-NEXT: addi a0, a0, -2 +; ZVFH32-NEXT: vl4re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -48 +; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 48 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16bf16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -48 +; ZVFH64-NEXT: .cfi_def_cfa_offset 48 +; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 48 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 3 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -32 
+; ZVFH64-NEXT: addi a0, sp, 32 +; ZVFH64-NEXT: vs4r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a1, a1, 2 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: vs4r.v v12, (a0) +; ZVFH64-NEXT: addi a0, a0, -2 +; ZVFH64-NEXT: vl4re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -48 +; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 48 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv16bf16_offset_min( %a, %b) #0 { +; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 64 +; ZVFHMIN64-NEXT: vs4r.v v12, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB120_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 64 +; ZVFHMIN64-NEXT: .LBB120_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 64 +; ZVFHMIN32-NEXT: vs4r.v v12, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB120_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 64 +; ZVFHMIN32-NEXT: .LBB120_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv16bf16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -48 +; ZVFH32-NEXT: .cfi_def_cfa_offset 48 +; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 48 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 3 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -32 +; ZVFH32-NEXT: addi a0, sp, 32 +; ZVFH32-NEXT: vs4r.v v8, (a0) +; ZVFH32-NEXT: csrr 
a1, vlenb +; ZVFH32-NEXT: slli a1, a1, 2 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: li a2, 64 +; ZVFH32-NEXT: vs4r.v v12, (a0) +; ZVFH32-NEXT: bltu a1, a2, .LBB120_2 +; ZVFH32-NEXT: # %bb.1: +; ZVFH32-NEXT: li a1, 64 +; ZVFH32-NEXT: .LBB120_2: +; ZVFH32-NEXT: sub a0, a0, a1 +; ZVFH32-NEXT: vl4re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -48 +; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 48 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16bf16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -48 +; ZVFH64-NEXT: .cfi_def_cfa_offset 48 +; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 48 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 3 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -32 +; ZVFH64-NEXT: addi a0, sp, 32 +; ZVFH64-NEXT: vs4r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a1, a1, 2 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: li a2, 64 +; ZVFH64-NEXT: vs4r.v v12, (a0) +; ZVFH64-NEXT: bltu a1, a2, .LBB120_2 +; ZVFH64-NEXT: # %bb.1: +; ZVFH64-NEXT: li a1, 64 +; ZVFH64-NEXT: .LBB120_2: +; ZVFH64-NEXT: sub a0, a0, a1 +; ZVFH64-NEXT: vl4re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -48 +; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 48 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -32) + ret %res +} + +define @splice_nxv16bf16_offset_max( %a, %b) #0 { +; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a2, a1, 2 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 31 +; ZVFHMIN64-NEXT: vs4r.v v12, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB121_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 31 +; ZVFHMIN64-NEXT: .LBB121_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; 
ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a2, a1, 2 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 31 +; ZVFHMIN32-NEXT: vs4r.v v12, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB121_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 31 +; ZVFHMIN32-NEXT: .LBB121_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv16bf16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -48 +; ZVFH32-NEXT: .cfi_def_cfa_offset 48 +; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 48 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 3 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -32 +; ZVFH32-NEXT: addi a0, sp, 32 +; ZVFH32-NEXT: vs4r.v v8, (a0) +; ZVFH32-NEXT: csrr a1, vlenb +; ZVFH32-NEXT: slli a2, a1, 2 +; ZVFH32-NEXT: add a2, a0, a2 +; ZVFH32-NEXT: slli a1, a1, 1 +; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: li a3, 31 +; ZVFH32-NEXT: vs4r.v v12, (a2) +; ZVFH32-NEXT: bltu a1, a3, .LBB121_2 +; ZVFH32-NEXT: # %bb.1: +; ZVFH32-NEXT: li a1, 31 +; ZVFH32-NEXT: .LBB121_2: +; ZVFH32-NEXT: slli a1, a1, 1 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: vl4re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -48 +; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 48 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16bf16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -48 +; ZVFH64-NEXT: .cfi_def_cfa_offset 48 +; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 48 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 3 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -32 +; ZVFH64-NEXT: addi a0, sp, 32 +; ZVFH64-NEXT: vs4r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a2, a1, 2 +; ZVFH64-NEXT: add a2, a0, a2 +; ZVFH64-NEXT: slli a1, a1, 1 +; ZVFH64-NEXT: addi a1, a1, -1 +; ZVFH64-NEXT: li a3, 31 +; ZVFH64-NEXT: vs4r.v v12, (a2) +; ZVFH64-NEXT: bltu a1, a3, .LBB121_2 +; ZVFH64-NEXT: # %bb.1: +; ZVFH64-NEXT: li a1, 31 +; ZVFH64-NEXT: .LBB121_2: +; ZVFH64-NEXT: slli a1, a1, 1 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: vl4re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -48 +; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 48 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 31) + ret %res +} + +declare @llvm.vector.splice.nxv32bf16(, , i32) + +define @splice_nxv32bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv32bf16_offset_negone( %a, %b) 
#0 { +; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 3 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs8r.v v16, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 3 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs8r.v v16, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32bf16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -80 +; ZVFH32-NEXT: .cfi_def_cfa_offset 80 +; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 80 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 4 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -64 +; ZVFH32-NEXT: addi a0, sp, 64 +; ZVFH32-NEXT: vs8r.v v8, (a0) +; ZVFH32-NEXT: csrr a1, vlenb +; ZVFH32-NEXT: slli a1, a1, 3 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: vs8r.v v16, (a0) +; ZVFH32-NEXT: addi a0, a0, -2 +; ZVFH32-NEXT: vl8re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -80 +; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 80 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32bf16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -80 +; ZVFH64-NEXT: .cfi_def_cfa_offset 80 +; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 80 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 4 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -64 
+; ZVFH64-NEXT: addi a0, sp, 64 +; ZVFH64-NEXT: vs8r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a1, a1, 3 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: vs8r.v v16, (a0) +; ZVFH64-NEXT: addi a0, a0, -2 +; ZVFH64-NEXT: vl8re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -80 +; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 80 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv32bf16_offset_min( %a, %b) #0 { +; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 3 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 128 +; ZVFHMIN64-NEXT: vs8r.v v16, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB124_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 128 +; ZVFHMIN64-NEXT: .LBB124_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 3 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 128 +; ZVFHMIN32-NEXT: vs8r.v v16, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB124_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 128 +; ZVFHMIN32-NEXT: .LBB124_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32bf16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -80 +; ZVFH32-NEXT: .cfi_def_cfa_offset 80 +; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 80 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 4 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -64 +; ZVFH32-NEXT: addi a0, sp, 64 +; ZVFH32-NEXT: vs8r.v v8, (a0) +; ZVFH32-NEXT: 
csrr a1, vlenb +; ZVFH32-NEXT: slli a1, a1, 3 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: li a2, 128 +; ZVFH32-NEXT: vs8r.v v16, (a0) +; ZVFH32-NEXT: bltu a1, a2, .LBB124_2 +; ZVFH32-NEXT: # %bb.1: +; ZVFH32-NEXT: li a1, 128 +; ZVFH32-NEXT: .LBB124_2: +; ZVFH32-NEXT: sub a0, a0, a1 +; ZVFH32-NEXT: vl8re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -80 +; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 80 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32bf16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -80 +; ZVFH64-NEXT: .cfi_def_cfa_offset 80 +; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 80 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 4 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -64 +; ZVFH64-NEXT: addi a0, sp, 64 +; ZVFH64-NEXT: vs8r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a1, a1, 3 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: li a2, 128 +; ZVFH64-NEXT: vs8r.v v16, (a0) +; ZVFH64-NEXT: bltu a1, a2, .LBB124_2 +; ZVFH64-NEXT: # %bb.1: +; ZVFH64-NEXT: li a1, 128 +; ZVFH64-NEXT: .LBB124_2: +; ZVFH64-NEXT: sub a0, a0, a1 +; ZVFH64-NEXT: vl8re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -80 +; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 80 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -64) + ret %res +} + +define @splice_nxv32bf16_offset_max( %a, %b) #0 { +; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a2, a1, 3 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 63 +; ZVFHMIN64-NEXT: vs8r.v v16, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB125_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 63 +; ZVFHMIN64-NEXT: .LBB125_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; 
ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a2, a1, 3 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 63 +; ZVFHMIN32-NEXT: vs8r.v v16, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB125_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 63 +; ZVFHMIN32-NEXT: .LBB125_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32bf16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: addi sp, sp, -80 +; ZVFH32-NEXT: .cfi_def_cfa_offset 80 +; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFH32-NEXT: .cfi_offset ra, -4 +; ZVFH32-NEXT: .cfi_offset s0, -8 +; ZVFH32-NEXT: addi s0, sp, 80 +; ZVFH32-NEXT: .cfi_def_cfa s0, 0 +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 4 +; ZVFH32-NEXT: sub sp, sp, a0 +; ZVFH32-NEXT: andi sp, sp, -64 +; ZVFH32-NEXT: addi a0, sp, 64 +; ZVFH32-NEXT: vs8r.v v8, (a0) +; ZVFH32-NEXT: csrr a1, vlenb +; ZVFH32-NEXT: slli a2, a1, 3 +; ZVFH32-NEXT: add a2, a0, a2 +; ZVFH32-NEXT: slli a1, a1, 2 +; ZVFH32-NEXT: addi a1, a1, -1 +; ZVFH32-NEXT: li a3, 63 +; ZVFH32-NEXT: vs8r.v v16, (a2) +; ZVFH32-NEXT: bltu a1, a3, .LBB125_2 +; ZVFH32-NEXT: # %bb.1: +; ZVFH32-NEXT: li a1, 63 +; ZVFH32-NEXT: .LBB125_2: +; ZVFH32-NEXT: slli a1, a1, 1 +; ZVFH32-NEXT: add a0, a0, a1 +; ZVFH32-NEXT: vl8re16.v v8, (a0) +; ZVFH32-NEXT: addi sp, s0, -80 +; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFH32-NEXT: addi sp, sp, 80 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32bf16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: addi sp, sp, -80 +; ZVFH64-NEXT: .cfi_def_cfa_offset 80 +; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFH64-NEXT: .cfi_offset ra, -8 +; ZVFH64-NEXT: .cfi_offset s0, -16 +; ZVFH64-NEXT: addi s0, sp, 80 +; ZVFH64-NEXT: .cfi_def_cfa s0, 0 +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 4 +; ZVFH64-NEXT: sub sp, sp, a0 +; ZVFH64-NEXT: andi sp, sp, -64 +; ZVFH64-NEXT: addi a0, sp, 64 +; ZVFH64-NEXT: vs8r.v v8, (a0) +; ZVFH64-NEXT: csrr a1, vlenb +; ZVFH64-NEXT: slli a2, a1, 3 +; ZVFH64-NEXT: add a2, a0, a2 +; ZVFH64-NEXT: slli a1, a1, 2 +; ZVFH64-NEXT: addi a1, a1, -1 +; ZVFH64-NEXT: li a3, 63 +; ZVFH64-NEXT: vs8r.v v16, (a2) +; ZVFH64-NEXT: bltu a1, a3, .LBB125_2 +; ZVFH64-NEXT: # %bb.1: +; ZVFH64-NEXT: li a1, 63 +; ZVFH64-NEXT: .LBB125_2: +; ZVFH64-NEXT: slli a1, a1, 1 +; ZVFH64-NEXT: add a0, a0, a1 +; ZVFH64-NEXT: vl8re16.v v8, (a0) +; ZVFH64-NEXT: addi sp, s0, -80 +; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFH64-NEXT: addi sp, sp, 80 +; ZVFH64-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 63) + ret %res +} + declare @llvm.vector.splice.nxv1f16(, , i32) define @splice_nxv1f16_offset_zero( %a, %b) #0 { @@ -1544,45 +2760,229 @@ define @splice_nxv1f16_offset_zero( %a, < } define @splice_nxv1f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv1f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; 
CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv1f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vse16.v v9, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv1f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vse16.v v9, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv1f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 3 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vslideup.vi v8, v9, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv1f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 3 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vslideup.vi v8, v9, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 -1) ret %res } define @splice_nxv1f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv1f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -2 -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv1f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 4 
+; ZVFHMIN64-NEXT: vse16.v v9, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB128_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 4 +; ZVFHMIN64-NEXT: .LBB128_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv1f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 4 +; ZVFHMIN32-NEXT: vse16.v v9, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB128_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 4 +; ZVFHMIN32-NEXT: .LBB128_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv1f16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 3 +; ZVFH32-NEXT: addi a0, a0, -2 +; ZVFH32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v9, 2 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv1f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 3 +; ZVFH64-NEXT: addi a0, a0, -2 +; ZVFH64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v9, 2 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 -2) ret %res } define @splice_nxv1f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv1f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv1f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a2, a1, 2 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: srli a1, a1, 3 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 1 +; ZVFHMIN64-NEXT: vse16.v v9, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB129_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 1 +; ZVFHMIN64-NEXT: .LBB129_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; 
ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv1f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a2, a1, 2 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: srli a1, a1, 3 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 1 +; ZVFHMIN32-NEXT: vse16.v v9, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB129_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 1 +; ZVFHMIN32-NEXT: .LBB129_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv1f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 3 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 +; ZVFH32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v9, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv1f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 3 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 +; ZVFH64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v9, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 1) ret %res } @@ -1598,45 +2998,229 @@ define @splice_nxv2f16_offset_zero( %a, < } define @splice_nxv2f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv2f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv2f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vse16.v v9, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv2f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: 
addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vse16.v v9, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv2f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vslideup.vi v8, v9, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv2f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vslideup.vi v8, v9, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 -1) ret %res } define @splice_nxv2f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv2f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -4 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv2f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 8 +; ZVFHMIN64-NEXT: vse16.v v9, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB132_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 8 +; ZVFHMIN64-NEXT: .LBB132_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv2f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 8 +; ZVFHMIN32-NEXT: vse16.v v9, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB132_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 8 +; ZVFHMIN32-NEXT: .LBB132_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv2f16_offset_min: +; ZVFH32: # %bb.0: +; 
ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -4 +; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v9, 4 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv2f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -4 +; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v9, 4 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 -4) ret %res } define @splice_nxv2f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv2f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -3 -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv2f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN64-NEXT: vse16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: srli a2, a1, 1 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: srli a1, a1, 2 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 3 +; ZVFHMIN64-NEXT: vse16.v v9, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB133_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 3 +; ZVFHMIN64-NEXT: .LBB133_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vle16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv2f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN32-NEXT: vse16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: srli a2, a1, 1 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: srli a1, a1, 2 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 3 +; ZVFHMIN32-NEXT: vse16.v v9, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB133_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 3 +; ZVFHMIN32-NEXT: .LBB133_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vle16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv2f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -3 +; ZVFH32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH32-NEXT: vslidedown.vi v8, v8, 3 +; ZVFH32-NEXT: vsetvli a1, 
zero, e16, mf2, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v9, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv2f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -3 +; ZVFH64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH64-NEXT: vslidedown.vi v8, v8, 3 +; ZVFH64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v9, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 3) ret %res } @@ -1652,45 +3236,229 @@ define @splice_nxv4f16_offset_zero( %a, < } define @splice_nxv4f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv4f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv4f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs1r.v v9, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv4f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs1r.v v9, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv4f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vslideup.vi v8, v9, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv4f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vslideup.vi v8, v9, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 -1) ret %res } define @splice_nxv4f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv4f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, 
e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 8 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv4f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 16 +; ZVFHMIN64-NEXT: vs1r.v v9, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB136_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 16 +; ZVFHMIN64-NEXT: .LBB136_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv4f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 16 +; ZVFHMIN32-NEXT: vs1r.v v9, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB136_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 16 +; ZVFHMIN32-NEXT: .LBB136_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv4f16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -8 +; ZVFH32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v9, 8 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv4f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -8 +; ZVFH64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v9, 8 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 -8) ret %res } define @splice_nxv4f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv4f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -7 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv4f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * 
vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs1r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: add a2, a0, a1 +; ZVFHMIN64-NEXT: srli a1, a1, 1 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 7 +; ZVFHMIN64-NEXT: vs1r.v v9, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB137_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 7 +; ZVFHMIN64-NEXT: .LBB137_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 1 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv4f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs1r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: add a2, a0, a1 +; ZVFHMIN32-NEXT: srli a1, a1, 1 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 7 +; ZVFHMIN32-NEXT: vs1r.v v9, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB137_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 7 +; ZVFHMIN32-NEXT: .LBB137_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 1 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv4f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: srli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -7 +; ZVFH32-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH32-NEXT: vslidedown.vi v8, v8, 7 +; ZVFH32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v9, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv4f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: srli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -7 +; ZVFH64-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH64-NEXT: vslidedown.vi v8, v8, 7 +; ZVFH64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v9, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 7) ret %res } @@ -1706,43 +3474,229 @@ define @splice_nxv8f16_offset_zero( %a, < } define @splice_nxv8f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv8f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv8f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs2r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs2r.v v10, 
(a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv8f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs2r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv8f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v10, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv8f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v10, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 -1) ret %res } define @splice_nxv8f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv8f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 16 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv8f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs2r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 32 +; ZVFHMIN64-NEXT: vs2r.v v10, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB140_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 32 +; ZVFHMIN64-NEXT: .LBB140_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv8f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 
4 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs2r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 32 +; ZVFHMIN32-NEXT: vs2r.v v10, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB140_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 32 +; ZVFHMIN32-NEXT: .LBB140_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv8f16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: addi a0, a0, -16 +; ZVFH32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v10, 16 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv8f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: addi a0, a0, -16 +; ZVFH64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v10, 16 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 -16) ret %res } define @splice_nxv8f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv8f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: addi a0, a0, -15 -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 15 -; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vx v8, v10, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv8f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -16 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; ZVFHMIN64-NEXT: addi a0, sp, 16 +; ZVFHMIN64-NEXT: vs2r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a2, a1, 1 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 15 +; ZVFHMIN64-NEXT: vs2r.v v10, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB141_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 15 +; ZVFHMIN64-NEXT: .LBB141_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 2 +; ZVFHMIN64-NEXT: add sp, sp, a0 +; ZVFHMIN64-NEXT: addi sp, sp, 16 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv8f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -16 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; ZVFHMIN32-NEXT: addi a0, sp, 16 +; ZVFHMIN32-NEXT: vs2r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a2, a1, 1 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 15 +; ZVFHMIN32-NEXT: vs2r.v v10, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB141_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 15 +; ZVFHMIN32-NEXT: 
.LBB141_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 2 +; ZVFHMIN32-NEXT: add sp, sp, a0 +; ZVFHMIN32-NEXT: addi sp, sp, 16 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv8f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: addi a0, a0, -15 +; ZVFH32-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH32-NEXT: vslidedown.vi v8, v8, 15 +; ZVFH32-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v10, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv8f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: addi a0, a0, -15 +; ZVFH64-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH64-NEXT: vslidedown.vi v8, v8, 15 +; ZVFH64-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v10, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 15) ret %res } @@ -1758,47 +3712,275 @@ define @splice_nxv16f16_offset_zero( %a } define @splice_nxv16f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv16f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv16f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs4r.v v12, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs4r.v v12, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: 
ret +; +; ZVFH32-LABEL: splice_nxv16f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v12, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v12, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 -1) ret %res } define @splice_nxv16f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv16f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -32 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv16f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 64 +; ZVFHMIN64-NEXT: vs4r.v v12, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB144_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 64 +; ZVFHMIN64-NEXT: .LBB144_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 64 +; ZVFHMIN32-NEXT: vs4r.v v12, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB144_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 64 +; ZVFHMIN32-NEXT: .LBB144_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload 
+; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv16f16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -32 +; ZVFH32-NEXT: li a1, 32 +; ZVFH32-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v12, a1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -32 +; ZVFH64-NEXT: li a1, 32 +; ZVFH64-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v12, a1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 -32) ret %res } define @splice_nxv16f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv16f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -31 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 31 -; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; CHECK-NEXT: vslideup.vx v8, v12, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv16f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -48 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 48 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 3 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -32 +; ZVFHMIN64-NEXT: addi a0, sp, 32 +; ZVFHMIN64-NEXT: vs4r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a2, a1, 2 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 31 +; ZVFHMIN64-NEXT: vs4r.v v12, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB145_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 31 +; ZVFHMIN64-NEXT: .LBB145_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -48 +; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 48 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv16f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -48 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 48 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 3 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -32 +; ZVFHMIN32-NEXT: addi a0, sp, 32 +; ZVFHMIN32-NEXT: vs4r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a2, a1, 2 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: addi a1, a1, -1 +; ZVFHMIN32-NEXT: li a3, 31 +; ZVFHMIN32-NEXT: vs4r.v v12, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB145_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 
31 +; ZVFHMIN32-NEXT: .LBB145_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -48 +; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 48 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv16f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 1 +; ZVFH32-NEXT: addi a0, a0, -31 +; ZVFH32-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH32-NEXT: vslidedown.vi v8, v8, 31 +; ZVFH32-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v12, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv16f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 1 +; ZVFH64-NEXT: addi a0, a0, -31 +; ZVFH64-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH64-NEXT: vslidedown.vi v8, v8, 31 +; ZVFH64-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v12, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 31) ret %res } @@ -1814,48 +3996,277 @@ define @splice_nxv32f16_offset_zero( %a } define @splice_nxv32f16_offset_negone( %a, %b) #0 { -; CHECK-LABEL: splice_nxv32f16_offset_negone: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv32f16_offset_negone: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 3 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vs8r.v v16, (a0) +; ZVFHMIN64-NEXT: addi a0, a0, -2 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32f16_offset_negone: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 3 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vs8r.v v16, (a0) +; ZVFHMIN32-NEXT: addi a0, a0, -2 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 
8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32f16_offset_negone: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -1 +; ZVFH32-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH32-NEXT: vslideup.vi v8, v16, 1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32f16_offset_negone: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -1 +; ZVFH64-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH64-NEXT: vslideup.vi v8, v16, 1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 -1) ret %res } define @splice_nxv32f16_offset_min( %a, %b) #0 { -; CHECK-LABEL: splice_nxv32f16_offset_min: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -64 -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vslideup.vx v8, v16, a1 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv32f16_offset_min: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a1, a1, 3 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: li a2, 128 +; ZVFHMIN64-NEXT: vs8r.v v16, (a0) +; ZVFHMIN64-NEXT: bltu a1, a2, .LBB148_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 128 +; ZVFHMIN64-NEXT: .LBB148_2: +; ZVFHMIN64-NEXT: sub a0, a0, a1 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32f16_offset_min: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a1, a1, 3 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: li a2, 128 +; ZVFHMIN32-NEXT: vs8r.v v16, (a0) +; ZVFHMIN32-NEXT: bltu a1, a2, .LBB148_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 128 +; ZVFHMIN32-NEXT: .LBB148_2: +; ZVFHMIN32-NEXT: sub a0, a0, a1 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; 
ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32f16_offset_min: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -64 +; ZVFH32-NEXT: li a1, 64 +; ZVFH32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH32-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v16, a1 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32f16_offset_min: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -64 +; ZVFH64-NEXT: li a1, 64 +; ZVFH64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 +; ZVFH64-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v16, a1 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 -64) ret %res } define @splice_nxv32f16_offset_max( %a, %b) #0 { -; CHECK-LABEL: splice_nxv32f16_offset_max: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -63 -; CHECK-NEXT: li a1, 63 -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vslideup.vx v8, v16, a0 -; CHECK-NEXT: ret +; ZVFHMIN64-LABEL: splice_nxv32f16_offset_max: +; ZVFHMIN64: # %bb.0: +; ZVFHMIN64-NEXT: addi sp, sp, -80 +; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVFHMIN64-NEXT: .cfi_offset ra, -4 +; ZVFHMIN64-NEXT: .cfi_offset s0, -8 +; ZVFHMIN64-NEXT: addi s0, sp, 80 +; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN64-NEXT: csrr a0, vlenb +; ZVFHMIN64-NEXT: slli a0, a0, 4 +; ZVFHMIN64-NEXT: sub sp, sp, a0 +; ZVFHMIN64-NEXT: andi sp, sp, -64 +; ZVFHMIN64-NEXT: addi a0, sp, 64 +; ZVFHMIN64-NEXT: vs8r.v v8, (a0) +; ZVFHMIN64-NEXT: csrr a1, vlenb +; ZVFHMIN64-NEXT: slli a2, a1, 3 +; ZVFHMIN64-NEXT: add a2, a0, a2 +; ZVFHMIN64-NEXT: slli a1, a1, 2 +; ZVFHMIN64-NEXT: addi a1, a1, -1 +; ZVFHMIN64-NEXT: li a3, 63 +; ZVFHMIN64-NEXT: vs8r.v v16, (a2) +; ZVFHMIN64-NEXT: bltu a1, a3, .LBB149_2 +; ZVFHMIN64-NEXT: # %bb.1: +; ZVFHMIN64-NEXT: li a1, 63 +; ZVFHMIN64-NEXT: .LBB149_2: +; ZVFHMIN64-NEXT: slli a1, a1, 1 +; ZVFHMIN64-NEXT: add a0, a0, a1 +; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN64-NEXT: addi sp, s0, -80 +; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVFHMIN64-NEXT: addi sp, sp, 80 +; ZVFHMIN64-NEXT: ret +; +; ZVFHMIN32-LABEL: splice_nxv32f16_offset_max: +; ZVFHMIN32: # %bb.0: +; ZVFHMIN32-NEXT: addi sp, sp, -80 +; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 +; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVFHMIN32-NEXT: .cfi_offset ra, -8 +; ZVFHMIN32-NEXT: .cfi_offset s0, -16 +; ZVFHMIN32-NEXT: addi s0, sp, 80 +; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 +; ZVFHMIN32-NEXT: csrr a0, vlenb +; ZVFHMIN32-NEXT: slli a0, a0, 4 +; ZVFHMIN32-NEXT: sub sp, sp, a0 +; ZVFHMIN32-NEXT: andi sp, sp, -64 +; ZVFHMIN32-NEXT: addi a0, sp, 64 +; ZVFHMIN32-NEXT: vs8r.v v8, (a0) +; ZVFHMIN32-NEXT: csrr a1, vlenb +; ZVFHMIN32-NEXT: slli a2, a1, 3 +; ZVFHMIN32-NEXT: add a2, a0, a2 +; ZVFHMIN32-NEXT: slli a1, a1, 2 +; ZVFHMIN32-NEXT: addi a1, 
a1, -1 +; ZVFHMIN32-NEXT: li a3, 63 +; ZVFHMIN32-NEXT: vs8r.v v16, (a2) +; ZVFHMIN32-NEXT: bltu a1, a3, .LBB149_2 +; ZVFHMIN32-NEXT: # %bb.1: +; ZVFHMIN32-NEXT: li a1, 63 +; ZVFHMIN32-NEXT: .LBB149_2: +; ZVFHMIN32-NEXT: slli a1, a1, 1 +; ZVFHMIN32-NEXT: add a0, a0, a1 +; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) +; ZVFHMIN32-NEXT: addi sp, s0, -80 +; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVFHMIN32-NEXT: addi sp, sp, 80 +; ZVFHMIN32-NEXT: ret +; +; ZVFH32-LABEL: splice_nxv32f16_offset_max: +; ZVFH32: # %bb.0: +; ZVFH32-NEXT: csrr a0, vlenb +; ZVFH32-NEXT: slli a0, a0, 2 +; ZVFH32-NEXT: addi a0, a0, -63 +; ZVFH32-NEXT: li a1, 63 +; ZVFH32-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH32-NEXT: vslidedown.vx v8, v8, a1 +; ZVFH32-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH32-NEXT: vslideup.vx v8, v16, a0 +; ZVFH32-NEXT: ret +; +; ZVFH64-LABEL: splice_nxv32f16_offset_max: +; ZVFH64: # %bb.0: +; ZVFH64-NEXT: csrr a0, vlenb +; ZVFH64-NEXT: slli a0, a0, 2 +; ZVFH64-NEXT: addi a0, a0, -63 +; ZVFH64-NEXT: li a1, 63 +; ZVFH64-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH64-NEXT: vslidedown.vx v8, v8, a1 +; ZVFH64-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; ZVFH64-NEXT: vslideup.vx v8, v16, a0 +; ZVFH64-NEXT: ret %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 63) ret %res } From 118f8ba50b2ef493853badc1fbcdea1ea9fb5faa Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 16 Oct 2024 17:36:03 +0100 Subject: [PATCH 2/3] [RISCV] Lower vector_splice on zvfhmin/zvfbfmin Similar to other permutation ops, we can just reuse the existing lowering. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 2694 ++---------------- 2 files changed, 307 insertions(+), 2389 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bf333b7b79016..076ed173f64e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1076,7 +1076,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_REVERSE}, + ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); MVT EltVT = VT.getVectorElementType(); if (isTypeLegal(EltVT)) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index 3f84f4549ce81..c9cb6dc6397c3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN64 -; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN32 -; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH32 -; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFH64 +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | 
FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK ; Tests assume VLEN=128 or vscale_range_min=2. @@ -1548,23 +1548,12 @@ define @splice_nxv1bf16_offset_zero( define @splice_nxv1bf16_offset_negone( %a, %b) #0 { ; CHECK-LABEL: splice_nxv1bf16_offset_negone: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, -2 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -1) ret %res @@ -1573,28 +1562,13 @@ define @splice_nxv1bf16_offset_negone( @splice_nxv1bf16_offset_min( %a, %b) #0 { ; CHECK-LABEL: splice_nxv1bf16_offset_min: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: li a2, 4 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: bltu a1, a2, .LBB104_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 4 -; CHECK-NEXT: .LBB104_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -2) ret %res @@ -1603,31 +1577,13 @@ define @splice_nxv1bf16_offset_min( % define @splice_nxv1bf16_offset_max( %a, %b) #0 { ; CHECK-LABEL: splice_nxv1bf16_offset_max: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: li a3, 1 -; CHECK-NEXT: vse16.v v9, (a2) -; CHECK-NEXT: bltu a1, a3, .LBB105_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 1 -; CHECK-NEXT: .LBB105_2: -; 
CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 1) ret %res @@ -1646,23 +1602,12 @@ define @splice_nxv2bf16_offset_zero( define @splice_nxv2bf16_offset_negone( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2bf16_offset_negone: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, -2 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -1) ret %res @@ -1671,28 +1616,13 @@ define @splice_nxv2bf16_offset_negone( @splice_nxv2bf16_offset_min( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2bf16_offset_min: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: li a2, 8 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: bltu a1, a2, .LBB108_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 8 -; CHECK-NEXT: .LBB108_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -4) ret %res @@ -1701,31 +1631,13 @@ define @splice_nxv2bf16_offset_min( % define @splice_nxv2bf16_offset_max( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2bf16_offset_max: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 3 ; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: li a3, 3 -; CHECK-NEXT: vse16.v v9, (a2) -; 
CHECK-NEXT: bltu a1, a3, .LBB109_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: .LBB109_2: -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 3) ret %res @@ -1744,23 +1656,12 @@ define @splice_nxv4bf16_offset_zero( define @splice_nxv4bf16_offset_negone( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4bf16_offset_negone: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs1r.v v9, (a0) -; CHECK-NEXT: addi a0, a0, -2 -; CHECK-NEXT: vl1re16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -1) ret %res @@ -1769,28 +1670,13 @@ define @splice_nxv4bf16_offset_negone( @splice_nxv4bf16_offset_min( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4bf16_offset_min: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vs1r.v v9, (a0) -; CHECK-NEXT: bltu a1, a2, .LBB112_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB112_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: vl1re16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 8 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -8) ret %res @@ -1799,31 +1685,13 @@ define @splice_nxv4bf16_offset_min( % define @splice_nxv4bf16_offset_max( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4bf16_offset_max: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: li a3, 7 -; CHECK-NEXT: vs1r.v v9, (a2) -; CHECK-NEXT: bltu a1, a3, .LBB113_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 7 -; 
CHECK-NEXT: .LBB113_2: -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 7) ret %res @@ -1842,24 +1710,12 @@ define @splice_nxv8bf16_offset_zero( define @splice_nxv8bf16_offset_negone( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8bf16_offset_negone: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs2r.v v10, (a0) -; CHECK-NEXT: addi a0, a0, -2 -; CHECK-NEXT: vl2re16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -1) ret %res @@ -1868,29 +1724,12 @@ define @splice_nxv8bf16_offset_negone( @splice_nxv8bf16_offset_min( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8bf16_offset_min: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vs2r.v v10, (a0) -; CHECK-NEXT: bltu a1, a2, .LBB116_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: .LBB116_2: -; CHECK-NEXT: sub a0, a0, a1 -; CHECK-NEXT: vl2re16.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 16 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -16) ret %res @@ -1899,31 +1738,12 @@ define @splice_nxv8bf16_offset_min( % define @splice_nxv8bf16_offset_max( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8bf16_offset_max: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 1 -; 
CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: li a3, 15 -; CHECK-NEXT: vs2r.v v10, (a2) -; CHECK-NEXT: bltu a1, a3, .LBB117_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 15 -; CHECK-NEXT: .LBB117_2: -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 ; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 15) ret %res @@ -1940,401 +1760,47 @@ define @splice_nxv16bf16_offset_zero( @splice_nxv16bf16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs4r.v v12, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs4r.v v12, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16bf16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -48 -; ZVFH32-NEXT: .cfi_def_cfa_offset 48 -; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 48 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 3 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -32 -; ZVFH32-NEXT: addi a0, sp, 32 -; ZVFH32-NEXT: vs4r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a1, 
a1, 2 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: vs4r.v v12, (a0) -; ZVFH32-NEXT: addi a0, a0, -2 -; ZVFH32-NEXT: vl4re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -48 -; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 48 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv16bf16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -48 -; ZVFH64-NEXT: .cfi_def_cfa_offset 48 -; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 48 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 3 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -32 -; ZVFH64-NEXT: addi a0, sp, 32 -; ZVFH64-NEXT: vs4r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a1, a1, 2 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: vs4r.v v12, (a0) -; ZVFH64-NEXT: addi a0, a0, -2 -; ZVFH64-NEXT: vl4re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -48 -; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 48 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -1) ret %res } define @splice_nxv16bf16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 64 -; ZVFHMIN64-NEXT: vs4r.v v12, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB120_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 64 -; ZVFHMIN64-NEXT: .LBB120_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; 
ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 64 -; ZVFHMIN32-NEXT: vs4r.v v12, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB120_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 64 -; ZVFHMIN32-NEXT: .LBB120_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16bf16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -48 -; ZVFH32-NEXT: .cfi_def_cfa_offset 48 -; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 48 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 3 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -32 -; ZVFH32-NEXT: addi a0, sp, 32 -; ZVFH32-NEXT: vs4r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a1, a1, 2 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: li a2, 64 -; ZVFH32-NEXT: vs4r.v v12, (a0) -; ZVFH32-NEXT: bltu a1, a2, .LBB120_2 -; ZVFH32-NEXT: # %bb.1: -; ZVFH32-NEXT: li a1, 64 -; ZVFH32-NEXT: .LBB120_2: -; ZVFH32-NEXT: sub a0, a0, a1 -; ZVFH32-NEXT: vl4re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -48 -; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 48 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv16bf16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -48 -; ZVFH64-NEXT: .cfi_def_cfa_offset 48 -; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 48 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 3 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -32 -; ZVFH64-NEXT: addi a0, sp, 32 -; ZVFH64-NEXT: vs4r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a1, a1, 2 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: li a2, 64 -; ZVFH64-NEXT: vs4r.v v12, (a0) -; ZVFH64-NEXT: bltu a1, a2, .LBB120_2 -; ZVFH64-NEXT: # %bb.1: -; ZVFH64-NEXT: li a1, 64 -; ZVFH64-NEXT: .LBB120_2: -; ZVFH64-NEXT: sub a0, a0, a1 -; ZVFH64-NEXT: vl4re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -48 -; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 48 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -32) ret %res } define @splice_nxv16bf16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16bf16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded 
Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a2, a1, 2 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 31 -; ZVFHMIN64-NEXT: vs4r.v v12, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB121_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 31 -; ZVFHMIN64-NEXT: .LBB121_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16bf16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a2, a1, 2 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 31 -; ZVFHMIN32-NEXT: vs4r.v v12, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB121_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 31 -; ZVFHMIN32-NEXT: .LBB121_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16bf16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -48 -; ZVFH32-NEXT: .cfi_def_cfa_offset 48 -; ZVFH32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 48 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 3 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -32 -; ZVFH32-NEXT: addi a0, sp, 32 -; ZVFH32-NEXT: vs4r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a2, a1, 2 -; ZVFH32-NEXT: add a2, a0, a2 -; ZVFH32-NEXT: slli a1, a1, 1 -; ZVFH32-NEXT: addi a1, a1, -1 -; ZVFH32-NEXT: li a3, 31 -; ZVFH32-NEXT: vs4r.v v12, (a2) -; ZVFH32-NEXT: bltu a1, a3, .LBB121_2 -; ZVFH32-NEXT: # %bb.1: -; ZVFH32-NEXT: li a1, 31 -; ZVFH32-NEXT: .LBB121_2: -; ZVFH32-NEXT: slli a1, a1, 1 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: vl4re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -48 -; ZVFH32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 48 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: 
splice_nxv16bf16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -48 -; ZVFH64-NEXT: .cfi_def_cfa_offset 48 -; ZVFH64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 48 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 3 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -32 -; ZVFH64-NEXT: addi a0, sp, 32 -; ZVFH64-NEXT: vs4r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a2, a1, 2 -; ZVFH64-NEXT: add a2, a0, a2 -; ZVFH64-NEXT: slli a1, a1, 1 -; ZVFH64-NEXT: addi a1, a1, -1 -; ZVFH64-NEXT: li a3, 31 -; ZVFH64-NEXT: vs4r.v v12, (a2) -; ZVFH64-NEXT: bltu a1, a3, .LBB121_2 -; ZVFH64-NEXT: # %bb.1: -; ZVFH64-NEXT: li a1, 31 -; ZVFH64-NEXT: .LBB121_2: -; ZVFH64-NEXT: slli a1, a1, 1 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: vl4re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -48 -; ZVFH64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 48 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 31) ret %res } @@ -2350,401 +1816,48 @@ define @splice_nxv32bf16_offset_zero( @splice_nxv32bf16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 3 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs8r.v v16, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 3 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs8r.v v16, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; 
ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32bf16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -80 -; ZVFH32-NEXT: .cfi_def_cfa_offset 80 -; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 80 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 4 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -64 -; ZVFH32-NEXT: addi a0, sp, 64 -; ZVFH32-NEXT: vs8r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a1, a1, 3 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: vs8r.v v16, (a0) -; ZVFH32-NEXT: addi a0, a0, -2 -; ZVFH32-NEXT: vl8re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -80 -; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 80 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32bf16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -80 -; ZVFH64-NEXT: .cfi_def_cfa_offset 80 -; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 80 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 4 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -64 -; ZVFH64-NEXT: addi a0, sp, 64 -; ZVFH64-NEXT: vs8r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a1, a1, 3 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: vs8r.v v16, (a0) -; ZVFH64-NEXT: addi a0, a0, -2 -; ZVFH64-NEXT: vl8re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -80 -; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 80 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv32bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -1) ret %res } define @splice_nxv32bf16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 3 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 128 -; ZVFHMIN64-NEXT: vs8r.v v16, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB124_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 128 -; ZVFHMIN64-NEXT: .LBB124_2: -; 
ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 3 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 128 -; ZVFHMIN32-NEXT: vs8r.v v16, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB124_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 128 -; ZVFHMIN32-NEXT: .LBB124_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32bf16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -80 -; ZVFH32-NEXT: .cfi_def_cfa_offset 80 -; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 80 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 4 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -64 -; ZVFH32-NEXT: addi a0, sp, 64 -; ZVFH32-NEXT: vs8r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a1, a1, 3 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: li a2, 128 -; ZVFH32-NEXT: vs8r.v v16, (a0) -; ZVFH32-NEXT: bltu a1, a2, .LBB124_2 -; ZVFH32-NEXT: # %bb.1: -; ZVFH32-NEXT: li a1, 128 -; ZVFH32-NEXT: .LBB124_2: -; ZVFH32-NEXT: sub a0, a0, a1 -; ZVFH32-NEXT: vl8re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -80 -; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 80 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32bf16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -80 -; ZVFH64-NEXT: .cfi_def_cfa_offset 80 -; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 80 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 4 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -64 -; ZVFH64-NEXT: addi a0, sp, 64 -; ZVFH64-NEXT: vs8r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a1, a1, 3 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: li a2, 128 -; ZVFH64-NEXT: vs8r.v v16, (a0) -; ZVFH64-NEXT: bltu a1, a2, .LBB124_2 -; ZVFH64-NEXT: # %bb.1: -; ZVFH64-NEXT: li a1, 128 -; ZVFH64-NEXT: .LBB124_2: -; ZVFH64-NEXT: sub a0, a0, a1 -; ZVFH64-NEXT: vl8re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -80 -; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte 
Folded Reload -; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 80 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv32bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -64) ret %res } define @splice_nxv32bf16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32bf16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a2, a1, 3 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 63 -; ZVFHMIN64-NEXT: vs8r.v v16, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB125_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 63 -; ZVFHMIN64-NEXT: .LBB125_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32bf16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a2, a1, 3 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 63 -; ZVFHMIN32-NEXT: vs8r.v v16, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB125_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 63 -; ZVFHMIN32-NEXT: .LBB125_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32bf16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: addi sp, sp, -80 -; ZVFH32-NEXT: .cfi_def_cfa_offset 80 -; ZVFH32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFH32-NEXT: .cfi_offset ra, -4 -; ZVFH32-NEXT: .cfi_offset s0, -8 -; ZVFH32-NEXT: addi s0, sp, 
80 -; ZVFH32-NEXT: .cfi_def_cfa s0, 0 -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 4 -; ZVFH32-NEXT: sub sp, sp, a0 -; ZVFH32-NEXT: andi sp, sp, -64 -; ZVFH32-NEXT: addi a0, sp, 64 -; ZVFH32-NEXT: vs8r.v v8, (a0) -; ZVFH32-NEXT: csrr a1, vlenb -; ZVFH32-NEXT: slli a2, a1, 3 -; ZVFH32-NEXT: add a2, a0, a2 -; ZVFH32-NEXT: slli a1, a1, 2 -; ZVFH32-NEXT: addi a1, a1, -1 -; ZVFH32-NEXT: li a3, 63 -; ZVFH32-NEXT: vs8r.v v16, (a2) -; ZVFH32-NEXT: bltu a1, a3, .LBB125_2 -; ZVFH32-NEXT: # %bb.1: -; ZVFH32-NEXT: li a1, 63 -; ZVFH32-NEXT: .LBB125_2: -; ZVFH32-NEXT: slli a1, a1, 1 -; ZVFH32-NEXT: add a0, a0, a1 -; ZVFH32-NEXT: vl8re16.v v8, (a0) -; ZVFH32-NEXT: addi sp, s0, -80 -; ZVFH32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFH32-NEXT: addi sp, sp, 80 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32bf16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: addi sp, sp, -80 -; ZVFH64-NEXT: .cfi_def_cfa_offset 80 -; ZVFH64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFH64-NEXT: .cfi_offset ra, -8 -; ZVFH64-NEXT: .cfi_offset s0, -16 -; ZVFH64-NEXT: addi s0, sp, 80 -; ZVFH64-NEXT: .cfi_def_cfa s0, 0 -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 4 -; ZVFH64-NEXT: sub sp, sp, a0 -; ZVFH64-NEXT: andi sp, sp, -64 -; ZVFH64-NEXT: addi a0, sp, 64 -; ZVFH64-NEXT: vs8r.v v8, (a0) -; ZVFH64-NEXT: csrr a1, vlenb -; ZVFH64-NEXT: slli a2, a1, 3 -; ZVFH64-NEXT: add a2, a0, a2 -; ZVFH64-NEXT: slli a1, a1, 2 -; ZVFH64-NEXT: addi a1, a1, -1 -; ZVFH64-NEXT: li a3, 63 -; ZVFH64-NEXT: vs8r.v v16, (a2) -; ZVFH64-NEXT: bltu a1, a3, .LBB125_2 -; ZVFH64-NEXT: # %bb.1: -; ZVFH64-NEXT: li a1, 63 -; ZVFH64-NEXT: .LBB125_2: -; ZVFH64-NEXT: slli a1, a1, 1 -; ZVFH64-NEXT: add a0, a0, a1 -; ZVFH64-NEXT: vl8re16.v v8, (a0) -; ZVFH64-NEXT: addi sp, s0, -80 -; ZVFH64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFH64-NEXT: addi sp, sp, 80 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv32bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -63 +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 63) ret %res } @@ -2760,229 +1873,45 @@ define @splice_nxv1f16_offset_zero( %a, < } define @splice_nxv1f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv1f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vse16.v v9, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv1f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; 
ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vse16.v v9, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv1f16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 3 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vslideup.vi v8, v9, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv1f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 3 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vslideup.vi v8, v9, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv1f16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 -1) ret %res } define @splice_nxv1f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv1f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 4 -; ZVFHMIN64-NEXT: vse16.v v9, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB128_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 4 -; ZVFHMIN64-NEXT: .LBB128_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv1f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 4 -; ZVFHMIN32-NEXT: vse16.v v9, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB128_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 4 -; ZVFHMIN32-NEXT: .LBB128_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; 
ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv1f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 3 -; ZVFH32-NEXT: addi a0, a0, -2 -; ZVFH32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v9, 2 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv1f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 3 -; ZVFH64-NEXT: addi a0, a0, -2 -; ZVFH64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v9, 2 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv1f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 -2) ret %res } define @splice_nxv1f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv1f16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a2, a1, 2 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: srli a1, a1, 3 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 1 -; ZVFHMIN64-NEXT: vse16.v v9, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB129_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 1 -; ZVFHMIN64-NEXT: .LBB129_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv1f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a2, a1, 2 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: srli a1, a1, 3 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 1 -; ZVFHMIN32-NEXT: vse16.v v9, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB129_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 1 -; ZVFHMIN32-NEXT: .LBB129_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv1f16_offset_max: -; 
ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 3 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 -; ZVFH32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v9, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv1f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 3 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 -; ZVFH64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v9, a0 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv1f16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv1f16( %a, %b, i32 1) ret %res } @@ -2998,229 +1927,45 @@ define @splice_nxv2f16_offset_zero( %a, < } define @splice_nxv2f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv2f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vse16.v v9, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv2f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vse16.v v9, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv2f16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vslideup.vi v8, v9, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv2f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vslideup.vi v8, v9, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv2f16_offset_negone: +; CHECK: # %bb.0: +; 
CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 -1) ret %res } define @splice_nxv2f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv2f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 8 -; ZVFHMIN64-NEXT: vse16.v v9, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB132_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 8 -; ZVFHMIN64-NEXT: .LBB132_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv2f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 8 -; ZVFHMIN32-NEXT: vse16.v v9, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB132_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 8 -; ZVFHMIN32-NEXT: .LBB132_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv2f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -4 -; ZVFH32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v9, 4 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv2f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -4 -; ZVFH64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v9, 4 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv2f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 -4) ret %res } define @splice_nxv2f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv2f16_offset_max: -; ZVFHMIN64: # 
%bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN64-NEXT: vse16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: srli a2, a1, 1 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: srli a1, a1, 2 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 3 -; ZVFHMIN64-NEXT: vse16.v v9, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB133_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 3 -; ZVFHMIN64-NEXT: .LBB133_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vle16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv2f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFHMIN32-NEXT: vse16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: srli a2, a1, 1 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: srli a1, a1, 2 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 3 -; ZVFHMIN32-NEXT: vse16.v v9, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB133_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 3 -; ZVFHMIN32-NEXT: .LBB133_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vle16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv2f16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -3 -; ZVFH32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFH32-NEXT: vslidedown.vi v8, v8, 3 -; ZVFH32-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v9, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv2f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -3 -; ZVFH64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; ZVFH64-NEXT: vslidedown.vi v8, v8, 3 -; ZVFH64-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v9, a0 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv2f16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv2f16( %a, %b, i32 3) ret %res } @@ -3236,229 +1981,45 @@ define @splice_nxv4f16_offset_zero( %a, < } define @splice_nxv4f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv4f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; 
ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs1r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs1r.v v9, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv4f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs1r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs1r.v v9, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv4f16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vslideup.vi v8, v9, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv4f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vslideup.vi v8, v9, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv4f16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 -1) ret %res } define @splice_nxv4f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv4f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs1r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 16 -; ZVFHMIN64-NEXT: vs1r.v v9, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB136_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 16 -; ZVFHMIN64-NEXT: .LBB136_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv4f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: 
.cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs1r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 16 -; ZVFHMIN32-NEXT: vs1r.v v9, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB136_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 16 -; ZVFHMIN32-NEXT: .LBB136_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv4f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -8 -; ZVFH32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v9, 8 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv4f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -8 -; ZVFH64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v9, 8 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv4f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 8 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 -8) ret %res } define @splice_nxv4f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv4f16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs1r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: add a2, a0, a1 -; ZVFHMIN64-NEXT: srli a1, a1, 1 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 7 -; ZVFHMIN64-NEXT: vs1r.v v9, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB137_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 7 -; ZVFHMIN64-NEXT: .LBB137_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 1 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv4f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs1r.v v8, (a0) -; ZVFHMIN32-NEXT: 
csrr a1, vlenb -; ZVFHMIN32-NEXT: add a2, a0, a1 -; ZVFHMIN32-NEXT: srli a1, a1, 1 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 7 -; ZVFHMIN32-NEXT: vs1r.v v9, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB137_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 7 -; ZVFHMIN32-NEXT: .LBB137_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl1re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 1 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv4f16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: srli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -7 -; ZVFH32-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFH32-NEXT: vslidedown.vi v8, v8, 7 -; ZVFH32-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v9, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv4f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: srli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -7 -; ZVFH64-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVFH64-NEXT: vslidedown.vi v8, v8, 7 -; ZVFH64-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v9, a0 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv4f16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv4f16( %a, %b, i32 7) ret %res } @@ -3474,229 +2035,43 @@ define @splice_nxv8f16_offset_zero( %a, < } define @splice_nxv8f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv8f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs2r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs2r.v v10, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv8f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs2r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs2r.v v10, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv8f16_offset_negone: 
-; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v10, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv8f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v10, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv8f16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 -1) ret %res } define @splice_nxv8f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv8f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs2r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 32 -; ZVFHMIN64-NEXT: vs2r.v v10, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB140_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 32 -; ZVFHMIN64-NEXT: .LBB140_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv8f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs2r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 32 -; ZVFHMIN32-NEXT: vs2r.v v10, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB140_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 32 -; ZVFHMIN32-NEXT: .LBB140_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv8f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: addi a0, a0, -16 -; ZVFH32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v10, 16 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv8f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: addi a0, a0, -16 -; ZVFH64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; 
ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v10, 16 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv8f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 16 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 -16) ret %res } define @splice_nxv8f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv8f16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -16 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN64-NEXT: addi a0, sp, 16 -; ZVFHMIN64-NEXT: vs2r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a2, a1, 1 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 15 -; ZVFHMIN64-NEXT: vs2r.v v10, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB141_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 15 -; ZVFHMIN64-NEXT: .LBB141_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 2 -; ZVFHMIN64-NEXT: add sp, sp, a0 -; ZVFHMIN64-NEXT: addi sp, sp, 16 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv8f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -16 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; ZVFHMIN32-NEXT: addi a0, sp, 16 -; ZVFHMIN32-NEXT: vs2r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a2, a1, 1 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 15 -; ZVFHMIN32-NEXT: vs2r.v v10, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB141_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 15 -; ZVFHMIN32-NEXT: .LBB141_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl2re16.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 2 -; ZVFHMIN32-NEXT: add sp, sp, a0 -; ZVFHMIN32-NEXT: addi sp, sp, 16 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv8f16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: addi a0, a0, -15 -; ZVFH32-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFH32-NEXT: vslidedown.vi v8, v8, 15 -; ZVFH32-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v10, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv8f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: addi a0, a0, -15 -; ZVFH64-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; ZVFH64-NEXT: vslidedown.vi v8, v8, 15 -; ZVFH64-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v10, a0 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv8f16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; 
CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv8f16( %a, %b, i32 15) ret %res } @@ -3712,275 +2087,47 @@ define @splice_nxv16f16_offset_zero( %a } define @splice_nxv16f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs4r.v v12, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs4r.v v12, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16f16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetivli zero, 1, e16, m4, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v12, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv16f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetivli zero, 1, e16, m4, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v12, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16f16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 -1) ret 
%res } define @splice_nxv16f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 64 -; ZVFHMIN64-NEXT: vs4r.v v12, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB144_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 64 -; ZVFHMIN64-NEXT: .LBB144_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 64 -; ZVFHMIN32-NEXT: vs4r.v v12, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB144_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 64 -; ZVFHMIN32-NEXT: .LBB144_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -32 -; ZVFH32-NEXT: li a1, 32 -; ZVFH32-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v12, a1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv16f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -32 -; ZVFH64-NEXT: li a1, 32 -; ZVFH64-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v12, a1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: 
vslideup.vx v8, v12, a1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 -32) ret %res } define @splice_nxv16f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv16f16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -48 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN64-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 48 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 3 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -32 -; ZVFHMIN64-NEXT: addi a0, sp, 32 -; ZVFHMIN64-NEXT: vs4r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a2, a1, 2 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 31 -; ZVFHMIN64-NEXT: vs4r.v v12, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB145_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 31 -; ZVFHMIN64-NEXT: .LBB145_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -48 -; ZVFHMIN64-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 48 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv16f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -48 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 48 -; ZVFHMIN32-NEXT: sd ra, 40(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 32(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 48 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 3 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -32 -; ZVFHMIN32-NEXT: addi a0, sp, 32 -; ZVFHMIN32-NEXT: vs4r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a2, a1, 2 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 31 -; ZVFHMIN32-NEXT: vs4r.v v12, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB145_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 31 -; ZVFHMIN32-NEXT: .LBB145_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl4re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -48 -; ZVFHMIN32-NEXT: ld ra, 40(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 32(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 48 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv16f16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 1 -; ZVFH32-NEXT: addi a0, a0, -31 -; ZVFH32-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFH32-NEXT: vslidedown.vi v8, v8, 31 -; ZVFH32-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v12, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv16f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 1 -; ZVFH64-NEXT: addi a0, a0, -31 -; ZVFH64-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFH64-NEXT: vslidedown.vi v8, v8, 31 -; ZVFH64-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v12, a0 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv16f16_offset_max: +; CHECK: # %bb.0: +; 
CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv16f16( %a, %b, i32 31) ret %res } @@ -3996,277 +2143,48 @@ define @splice_nxv32f16_offset_zero( %a } define @splice_nxv32f16_offset_negone( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32f16_offset_negone: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 3 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vs8r.v v16, (a0) -; ZVFHMIN64-NEXT: addi a0, a0, -2 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32f16_offset_negone: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 3 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vs8r.v v16, (a0) -; ZVFHMIN32-NEXT: addi a0, a0, -2 -; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32f16_offset_negone: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -1 -; ZVFH32-NEXT: vsetivli zero, 1, e16, m8, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; ZVFH32-NEXT: vslideup.vi v8, v16, 1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32f16_offset_negone: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -1 -; ZVFH64-NEXT: vsetivli zero, 1, e16, m8, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; ZVFH64-NEXT: vslideup.vi v8, v16, 1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv32f16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, 
zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 -1) ret %res } define @splice_nxv32f16_offset_min( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32f16_offset_min: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a1, a1, 3 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: li a2, 128 -; ZVFHMIN64-NEXT: vs8r.v v16, (a0) -; ZVFHMIN64-NEXT: bltu a1, a2, .LBB148_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 128 -; ZVFHMIN64-NEXT: .LBB148_2: -; ZVFHMIN64-NEXT: sub a0, a0, a1 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32f16_offset_min: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a1, a1, 3 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: li a2, 128 -; ZVFHMIN32-NEXT: vs8r.v v16, (a0) -; ZVFHMIN32-NEXT: bltu a1, a2, .LBB148_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 128 -; ZVFHMIN32-NEXT: .LBB148_2: -; ZVFHMIN32-NEXT: sub a0, a0, a1 -; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32f16_offset_min: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -64 -; ZVFH32-NEXT: li a1, 64 -; ZVFH32-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH32-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v16, a1 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32f16_offset_min: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -64 -; ZVFH64-NEXT: li a1, 64 -; ZVFH64-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; ZVFH64-NEXT: vslidedown.vx v8, v8, a0 -; ZVFH64-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; ZVFH64-NEXT: vslideup.vx v8, v16, a1 -; ZVFH64-NEXT: ret +; CHECK-LABEL: splice_nxv32f16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: li a1, 64 +; 
CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: ret %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 -64) ret %res } define @splice_nxv32f16_offset_max( %a, %b) #0 { -; ZVFHMIN64-LABEL: splice_nxv32f16_offset_max: -; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: addi sp, sp, -80 -; ZVFHMIN64-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN64-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVFHMIN64-NEXT: .cfi_offset ra, -4 -; ZVFHMIN64-NEXT: .cfi_offset s0, -8 -; ZVFHMIN64-NEXT: addi s0, sp, 80 -; ZVFHMIN64-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN64-NEXT: csrr a0, vlenb -; ZVFHMIN64-NEXT: slli a0, a0, 4 -; ZVFHMIN64-NEXT: sub sp, sp, a0 -; ZVFHMIN64-NEXT: andi sp, sp, -64 -; ZVFHMIN64-NEXT: addi a0, sp, 64 -; ZVFHMIN64-NEXT: vs8r.v v8, (a0) -; ZVFHMIN64-NEXT: csrr a1, vlenb -; ZVFHMIN64-NEXT: slli a2, a1, 3 -; ZVFHMIN64-NEXT: add a2, a0, a2 -; ZVFHMIN64-NEXT: slli a1, a1, 2 -; ZVFHMIN64-NEXT: addi a1, a1, -1 -; ZVFHMIN64-NEXT: li a3, 63 -; ZVFHMIN64-NEXT: vs8r.v v16, (a2) -; ZVFHMIN64-NEXT: bltu a1, a3, .LBB149_2 -; ZVFHMIN64-NEXT: # %bb.1: -; ZVFHMIN64-NEXT: li a1, 63 -; ZVFHMIN64-NEXT: .LBB149_2: -; ZVFHMIN64-NEXT: slli a1, a1, 1 -; ZVFHMIN64-NEXT: add a0, a0, a1 -; ZVFHMIN64-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN64-NEXT: addi sp, s0, -80 -; ZVFHMIN64-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVFHMIN64-NEXT: addi sp, sp, 80 -; ZVFHMIN64-NEXT: ret -; -; ZVFHMIN32-LABEL: splice_nxv32f16_offset_max: -; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: addi sp, sp, -80 -; ZVFHMIN32-NEXT: .cfi_def_cfa_offset 80 -; ZVFHMIN32-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVFHMIN32-NEXT: .cfi_offset ra, -8 -; ZVFHMIN32-NEXT: .cfi_offset s0, -16 -; ZVFHMIN32-NEXT: addi s0, sp, 80 -; ZVFHMIN32-NEXT: .cfi_def_cfa s0, 0 -; ZVFHMIN32-NEXT: csrr a0, vlenb -; ZVFHMIN32-NEXT: slli a0, a0, 4 -; ZVFHMIN32-NEXT: sub sp, sp, a0 -; ZVFHMIN32-NEXT: andi sp, sp, -64 -; ZVFHMIN32-NEXT: addi a0, sp, 64 -; ZVFHMIN32-NEXT: vs8r.v v8, (a0) -; ZVFHMIN32-NEXT: csrr a1, vlenb -; ZVFHMIN32-NEXT: slli a2, a1, 3 -; ZVFHMIN32-NEXT: add a2, a0, a2 -; ZVFHMIN32-NEXT: slli a1, a1, 2 -; ZVFHMIN32-NEXT: addi a1, a1, -1 -; ZVFHMIN32-NEXT: li a3, 63 -; ZVFHMIN32-NEXT: vs8r.v v16, (a2) -; ZVFHMIN32-NEXT: bltu a1, a3, .LBB149_2 -; ZVFHMIN32-NEXT: # %bb.1: -; ZVFHMIN32-NEXT: li a1, 63 -; ZVFHMIN32-NEXT: .LBB149_2: -; ZVFHMIN32-NEXT: slli a1, a1, 1 -; ZVFHMIN32-NEXT: add a0, a0, a1 -; ZVFHMIN32-NEXT: vl8re16.v v8, (a0) -; ZVFHMIN32-NEXT: addi sp, s0, -80 -; ZVFHMIN32-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVFHMIN32-NEXT: addi sp, sp, 80 -; ZVFHMIN32-NEXT: ret -; -; ZVFH32-LABEL: splice_nxv32f16_offset_max: -; ZVFH32: # %bb.0: -; ZVFH32-NEXT: csrr a0, vlenb -; ZVFH32-NEXT: slli a0, a0, 2 -; ZVFH32-NEXT: addi a0, a0, -63 -; ZVFH32-NEXT: li a1, 63 -; ZVFH32-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH32-NEXT: vslidedown.vx v8, v8, a1 -; ZVFH32-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFH32-NEXT: vslideup.vx v8, v16, a0 -; ZVFH32-NEXT: ret -; -; ZVFH64-LABEL: splice_nxv32f16_offset_max: -; ZVFH64: # %bb.0: -; ZVFH64-NEXT: csrr a0, vlenb -; ZVFH64-NEXT: slli a0, a0, 2 -; ZVFH64-NEXT: addi a0, a0, -63 -; ZVFH64-NEXT: li a1, 63 -; ZVFH64-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFH64-NEXT: 
vslidedown.vx v8, v8, a1
-; ZVFH64-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH64-NEXT: vslideup.vx v8, v16, a0
-; ZVFH64-NEXT: ret
+; CHECK-LABEL: splice_nxv32f16_offset_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: addi a0, a0, -63
+; CHECK-NEXT: li a1, 63
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a1
+; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v16, a0
+; CHECK-NEXT: ret
 %res = call @llvm.vector.splice.nxv32f16( %a, %b, i32 63)
 ret %res
 }

From 6fb6675587e50c1b37329f7808d3450984f5ce88 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 16 Oct 2024 17:50:24 +0100
Subject: [PATCH 3/3] Remove redundant --check-prefixes

---
 llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index c9cb6dc6397c3..5460caea196cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s

 ; Tests assume VLEN=128 or vscale_range_min=2.