diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 967a6cf82433f..cc4ad13395e61 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2250,6 +2250,10 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; + // Extracts from index 0 are just subreg extracts. + if (Index == 0) + return true; + // Only support extracting a fixed from a fixed vector for now. if (ResVT.isScalableVector() || SrcVT.isScalableVector()) return false; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 8dfa79a0f1596..f6bdd45330384 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,461 +183,452 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 96 +; RV32-NEXT: li a3, 100 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 100 * vlenb ; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: addi a5, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 +; RV32-NEXT: lui a6, 12291 +; RV32-NEXT: lui a7, %hi(.LCPI8_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI8_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a5) -; RV32-NEXT: lui a5, 12291 -; RV32-NEXT: vmv.s.x v3, a3 -; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: vle32.v v24, (a5) +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 80 -; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: addi a6, a6, 3 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v8, v16, 4 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a1, a1, a6 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vslideup.vi v16, v24, 4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 88 -; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: li a5, 76 +; RV32-NEXT: mul a1, a1, a5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 88 -; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: li a5, 92 +; RV32-NEXT: mul a1, a1, a5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv1r.v v30, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t +; RV32-NEXT: vslideup.vi v16, v8, 10, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a6, 68 -; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: li a5, 72 +; RV32-NEXT: mul a1, a1, a5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a4) -; RV32-NEXT: addi a5, a5, 3 -; RV32-NEXT: vmv.s.x v0, a5 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 56 +; RV32-NEXT: li a4, 84 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v4, v16, 2 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 -; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 88 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v4, v16, 8, v0.t +; RV32-NEXT: vle16.v v28, (a7) +; RV32-NEXT: vmv.s.x v0, a6 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: lui a4, 49164 -; RV32-NEXT: lui a5, %hi(.LCPI8_1) -; RV32-NEXT: addi a5, a5, %lo(.LCPI8_1) -; RV32-NEXT: vle16.v v6, (a1) -; RV32-NEXT: addi a4, a4, 12 -; RV32-NEXT: vle16.v v4, (a5) -; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v24, v6 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 48 +; RV32-NEXT: li a4, 84 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v0, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 52 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v8, v24, 2 +; RV32-NEXT: vmv1r.v v0, v30 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 80 +; RV32-NEXT: li a4, 92 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v4 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vslideup.vi v8, v16, 8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 40 +; RV32-NEXT: li a4, 60 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: lui a5, 196656 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV32-NEXT: lui a6, 3 -; RV32-NEXT: lui a7, 786624 -; RV32-NEXT: lui t0, 768 -; RV32-NEXT: li a4, 48 -; RV32-NEXT: addi a5, a5, 48 -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: addi a6, a6, 3 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li t1, 80 -; RV32-NEXT: mul a5, a5, t1 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li t1, 72 -; RV32-NEXT: mul a5, a5, t1 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: lui a7, 49164 +; RV32-NEXT: lui a1, %hi(.LCPI8_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: lui t2, 3 +; RV32-NEXT: lui t1, 196656 +; RV32-NEXT: lui a4, %hi(.LCPI8_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui t0, 786624 +; RV32-NEXT: li a5, 48 +; RV32-NEXT: lui a6, 768 +; RV32-NEXT: addi a7, a7, 12 +; RV32-NEXT: vmv.s.x v0, a7 +; RV32-NEXT: addi t2, t2, 3 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t3, 84 +; RV32-NEXT: mul a7, a7, t3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v16, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 6 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 3 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a6 -; RV32-NEXT: addi a5, a7, 192 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t3, 36 +; RV32-NEXT: mul a7, a7, t3 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, t2 +; RV32-NEXT: addi a7, t1, 48 +; RV32-NEXT: csrr t1, vlenb +; RV32-NEXT: li t2, 92 +; RV32-NEXT: mul t1, t1, t2 +; RV32-NEXT: add t1, sp, t1 +; RV32-NEXT: addi t1, t1, 16 +; RV32-NEXT: vl8r.v v24, (t1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr t1, vlenb +; RV32-NEXT: li t2, 76 +; RV32-NEXT: mul t1, t1, t2 +; RV32-NEXT: add t1, sp, t1 +; RV32-NEXT: addi t1, t1, 16 +; RV32-NEXT: vl8r.v v8, (t1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: addi t1, sp, 16 +; RV32-NEXT: vs4r.v v8, (t1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a7 +; RV32-NEXT: addi a3, a3, 12 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: slli a7, a7, 6 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t1, 20 +; RV32-NEXT: mul a7, a7, t1 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv8r.v v16, v24 +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: addi a3, t0, 192 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 92 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 76 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 48 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs4r.v v8, (a7) # vscale x 32-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: li a3, 192 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 84 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a7, vlenb +; RV32-NEXT: li t0, 28 +; RV32-NEXT: mul a7, a7, t0 +; RV32-NEXT: add a7, sp, a7 +; RV32-NEXT: addi a7, a7, 16 +; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: addi a5, a6, 768 ; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 88 +; RV32-NEXT: li a7, 92 ; RV32-NEXT: mul a6, a6, a7 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 -; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 56 +; RV32-NEXT: li a7, 76 ; RV32-NEXT: mul a6, a6, a7 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 -; RV32-NEXT: vl8r.v v24, (a6) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v8, (a6) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 44 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs4r.v v8, (a6) # vscale x 32-byte Folded Spill ; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: addi a3, a3, 12 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 80 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 72 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; RV32-NEXT: vle16.v v6, (a1) +; RV32-NEXT: vle16.v v2, (a4) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 84 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: addi a3, t0, 768 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 88 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 28 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs4r.v v8, (a5) # vscale x 32-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a3 -; RV32-NEXT: lui a3, 3073 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 80 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 72 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v28, (a1) -; RV32-NEXT: addi a1, a3, -1024 -; RV32-NEXT: vmv4r.v v8, v24 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 88 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs4r.v v16, (a3) # vscale x 32-byte Folded Spill -; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a4, 12 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v16, v28 -; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v8, v6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 92 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: li a3, 92 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 20 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_3) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_3) -; RV32-NEXT: li a2, 192 -; RV32-NEXT: vmv.s.x v0, a2 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v8, v2 +; RV32-NEXT: lui a1, %hi(.LCPI8_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) +; RV32-NEXT: lui a3, 3073 +; RV32-NEXT: addi a3, a3, -1024 +; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v3, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 88 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: li a3, 84 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 88 +; RV32-NEXT: li a2, 84 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 68 +; RV32-NEXT: li a2, 72 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vl4r.v v28, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 52 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v24 +; RV32-NEXT: vmv.v.v v28, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 68 +; RV32-NEXT: li a2, 72 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 60 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmv.v.v v16, v24 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 60 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v4, v12 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v28, v24 ; RV32-NEXT: lui a1, %hi(.LCPI8_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) ; RV32-NEXT: lui a2, %hi(.LCPI8_5) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v24, (a2) -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: lui a1, %hi(.LCPI8_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v18, (a1) +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a2, 28 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vrgatherei16.vv v16, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 28 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v0, v12, v24 +; RV32-NEXT: vrgatherei16.vv v24, v20, v8 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v0, v8 +; RV32-NEXT: vmv.v.v v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 12 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v18 -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: vrgatherei16.vv v16, v0, v10 +; RV32-NEXT: lui a1, %hi(.LCPI8_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) ; RV32-NEXT: lui a2, %hi(.LCPI8_8) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v4, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI8_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v14, (a2) -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs2r.v v14, (a2) # vscale x 16-byte Folded Spill +; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle16.v v13, (a1) +; RV32-NEXT: vle16.v v5, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v13, (a1) # vscale x 8-byte Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v4, v12 +; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vrgatherei16.vv v0, v20, v4 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 84 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v6, (a1) # vscale x 16-byte Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v6 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 88 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v24, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v8, v6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 92 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v28, (a1) # vscale x 8-byte Folded Reload +; RV32-NEXT: vl4r.v v12, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v12, v24, v28 +; RV32-NEXT: vrgatherei16.vv v8, v12, v5 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v8 +; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a1) +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v16, (a1) -; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v0, (a1) +; RV32-NEXT: addi a1, a0, 192 +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 6 +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 68 +; RV32-NEXT: li a2, 72 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 96 +; RV32-NEXT: li a1, 100 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 @@ -1073,16 +1064,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v0, v20 ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: addi a1, a0, 320 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 61 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v20, (a2) # vscale x 32-byte Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v20, (a1) -; RV64-NEXT: addi a1, a0, 320 +; RV64-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index a9f6392800012..76eca8e034303 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -730,46 +730,69 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB12_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB12_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_15 -; RV64ZVE32F-NEXT: .LBB12_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB12_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_16 -; RV64ZVE32F-NEXT: .LBB12_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB12_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_9 -; RV64ZVE32F-NEXT: .LBB12_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB12_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB12_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB12_11: # %else17 +; RV64ZVE32F-NEXT: .LBB12_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB12_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB12_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -778,40 +801,10 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB12_13: # %else20 +; RV64ZVE32F-NEXT: .LBB12_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB12_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 -; RV64ZVE32F-NEXT: .LBB12_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_7 -; RV64ZVE32F-NEXT: .LBB12_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_8 -; RV64ZVE32F-NEXT: j .LBB12_9 %ptrs = getelementptr inbounds i8, ptr %base, <8 x i8> %idxs %v = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> %m, <8 x i8> %passthru) ret <8 x i8> %v @@ -1431,38 +1424,67 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB23_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB23_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_15 -; RV64ZVE32F-NEXT: .LBB23_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB23_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_16 -; RV64ZVE32F-NEXT: .LBB23_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB23_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_9 -; RV64ZVE32F-NEXT: .LBB23_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB23_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB23_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -1471,10 +1493,10 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB23_11: # %else17 +; RV64ZVE32F-NEXT: .LBB23_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB23_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB23_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1485,47 +1507,10 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB23_13: # %else20 +; RV64ZVE32F-NEXT: .LBB23_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 -; RV64ZVE32F-NEXT: .LBB23_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_7 -; RV64ZVE32F-NEXT: .LBB23_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_8 -; RV64ZVE32F-NEXT: j .LBB23_9 %ptrs = getelementptr inbounds i16, ptr %base, <8 x i8> %idxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) ret <8 x i16> %v @@ -1580,38 +1565,67 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB24_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB24_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_15 -; RV64ZVE32F-NEXT: .LBB24_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB24_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_16 -; RV64ZVE32F-NEXT: .LBB24_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB24_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_9 -; RV64ZVE32F-NEXT: .LBB24_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB24_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB24_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -1620,10 +1634,10 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB24_11: # %else17 +; RV64ZVE32F-NEXT: .LBB24_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB24_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB24_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1634,47 +1648,10 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB24_13: # %else20 +; RV64ZVE32F-NEXT: .LBB24_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 -; RV64ZVE32F-NEXT: .LBB24_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_7 -; RV64ZVE32F-NEXT: .LBB24_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_8 -; RV64ZVE32F-NEXT: j .LBB24_9 %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %eidxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) @@ -1730,39 +1707,71 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB25_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB25_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_15 -; RV64ZVE32F-NEXT: .LBB25_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB25_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_16 -; RV64ZVE32F-NEXT: .LBB25_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB25_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_9 -; RV64ZVE32F-NEXT: .LBB25_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB25_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB25_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1772,10 +1781,10 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB25_11: # %else17 +; RV64ZVE32F-NEXT: .LBB25_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB25_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB25_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1787,50 +1796,10 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB25_13: # %else20 +; RV64ZVE32F-NEXT: .LBB25_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 -; RV64ZVE32F-NEXT: .LBB25_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_7 -; RV64ZVE32F-NEXT: .LBB25_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_8 -; RV64ZVE32F-NEXT: j .LBB25_9 %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %eidxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) @@ -1883,37 +1852,63 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB26_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_15 -; RV64ZVE32F-NEXT: .LBB26_6: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_16 -; RV64ZVE32F-NEXT: .LBB26_7: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_9 -; RV64ZVE32F-NEXT: .LBB26_8: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB26_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB26_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB26_10: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB26_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB26_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -1921,10 +1916,10 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB26_11: # %else17 +; RV64ZVE32F-NEXT: .LBB26_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB26_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB26_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -1934,43 +1929,10 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB26_13: # %else20 +; RV64ZVE32F-NEXT: .LBB26_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 -; RV64ZVE32F-NEXT: .LBB26_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_7 -; RV64ZVE32F-NEXT: .LBB26_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_8 -; RV64ZVE32F-NEXT: j .LBB26_9 %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %idxs %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x i16> %passthru) ret <8 x i16> %v @@ -2478,25 +2440,54 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB35_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 -; RV64ZVE32F-NEXT: .LBB35_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB35_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_16 -; RV64ZVE32F-NEXT: .LBB35_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB35_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_9 -; RV64ZVE32F-NEXT: .LBB35_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -2504,12 +2495,12 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB35_9: # %else14 +; RV64ZVE32F-NEXT: .LBB35_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2518,10 +2509,10 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB35_11: # %else17 +; RV64ZVE32F-NEXT: .LBB35_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB35_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -2532,47 +2523,10 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB35_13: # %else20 +; RV64ZVE32F-NEXT: .LBB35_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 -; RV64ZVE32F-NEXT: .LBB35_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_7 -; RV64ZVE32F-NEXT: .LBB35_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 -; RV64ZVE32F-NEXT: j .LBB35_9 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i8> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -2626,25 +2580,54 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB36_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB36_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_15 -; RV64ZVE32F-NEXT: .LBB36_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB36_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_16 -; RV64ZVE32F-NEXT: .LBB36_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB36_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_9 -; RV64ZVE32F-NEXT: .LBB36_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -2652,12 +2635,12 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB36_9: # %else14 +; RV64ZVE32F-NEXT: .LBB36_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2666,10 +2649,10 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB36_11: # %else17 +; RV64ZVE32F-NEXT: .LBB36_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB36_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB36_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -2680,47 +2663,10 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB36_13: # %else20 +; RV64ZVE32F-NEXT: .LBB36_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 -; RV64ZVE32F-NEXT: .LBB36_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_7 -; RV64ZVE32F-NEXT: .LBB36_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_8 -; RV64ZVE32F-NEXT: j .LBB36_9 %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -2778,25 +2724,57 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB37_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB37_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_15 -; RV64ZVE32F-NEXT: .LBB37_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB37_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_16 -; RV64ZVE32F-NEXT: .LBB37_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB37_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_9 -; RV64ZVE32F-NEXT: .LBB37_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2805,12 +2783,12 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB37_9: # %else14 +; RV64ZVE32F-NEXT: .LBB37_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2820,10 +2798,10 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB37_11: # %else17 +; RV64ZVE32F-NEXT: .LBB37_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB37_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB37_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -2835,50 +2813,10 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB37_13: # %else20 +; RV64ZVE32F-NEXT: .LBB37_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_6 -; RV64ZVE32F-NEXT: .LBB37_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_7 -; RV64ZVE32F-NEXT: .LBB37_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_8 -; RV64ZVE32F-NEXT: j .LBB37_9 %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -2935,25 +2873,54 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB38_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB38_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_15 -; RV64ZVE32F-NEXT: .LBB38_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB38_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_16 -; RV64ZVE32F-NEXT: .LBB38_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB38_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_9 -; RV64ZVE32F-NEXT: .LBB38_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -2961,12 +2928,12 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB38_9: # %else14 +; RV64ZVE32F-NEXT: .LBB38_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2975,10 +2942,10 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB38_11: # %else17 +; RV64ZVE32F-NEXT: .LBB38_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB38_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB38_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -2989,47 +2956,10 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB38_13: # %else20 +; RV64ZVE32F-NEXT: .LBB38_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 -; RV64ZVE32F-NEXT: .LBB38_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_7 -; RV64ZVE32F-NEXT: .LBB38_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_8 -; RV64ZVE32F-NEXT: j .LBB38_9 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i16> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -3085,25 +3015,54 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB39_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB39_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_15 -; RV64ZVE32F-NEXT: .LBB39_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB39_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_16 -; RV64ZVE32F-NEXT: .LBB39_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB39_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_9 -; RV64ZVE32F-NEXT: .LBB39_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -3111,12 +3070,12 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB39_9: # %else14 +; RV64ZVE32F-NEXT: .LBB39_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3125,10 +3084,10 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB39_11: # %else17 +; RV64ZVE32F-NEXT: .LBB39_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB39_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB39_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -3139,47 +3098,10 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB39_13: # %else20 +; RV64ZVE32F-NEXT: .LBB39_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 -; RV64ZVE32F-NEXT: .LBB39_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_7 -; RV64ZVE32F-NEXT: .LBB39_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_8 -; RV64ZVE32F-NEXT: j .LBB39_9 %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -3238,25 +3160,57 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB40_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB40_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB40_15 -; RV64ZVE32F-NEXT: .LBB40_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB40_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 -; RV64ZVE32F-NEXT: .LBB40_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB40_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_9 -; RV64ZVE32F-NEXT: .LBB40_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3265,12 +3219,12 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB40_9: # %else14 +; RV64ZVE32F-NEXT: .LBB40_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 @@ -3280,10 +3234,10 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB40_11: # %else17 +; RV64ZVE32F-NEXT: .LBB40_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB40_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB40_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -3295,50 +3249,10 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB40_13: # %else20 +; RV64ZVE32F-NEXT: .LBB40_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_6 -; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_7 -; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB40_8 -; RV64ZVE32F-NEXT: j .LBB40_9 %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) @@ -3389,37 +3303,45 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB41_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 2 +; RV64ZVE32F-NEXT: .LBB41_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB41_15 -; RV64ZVE32F-NEXT: .LBB41_6: # %else8 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB41_16 -; RV64ZVE32F-NEXT: .LBB41_7: # %else11 +; RV64ZVE32F-NEXT: .LBB41_8: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_9 -; RV64ZVE32F-NEXT: .LBB41_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_10 +; RV64ZVE32F-NEXT: .LBB41_9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 -; RV64ZVE32F-NEXT: .LBB41_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB41_10: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3427,10 +3349,10 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB41_11: # %else17 +; RV64ZVE32F-NEXT: .LBB41_12: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB41_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB41_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -3440,42 +3362,32 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB41_13: # %else20 +; RV64ZVE32F-NEXT: .LBB41_14: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB41_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_6 ; RV64ZVE32F-NEXT: .LBB41_15: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB41_8 ; RV64ZVE32F-NEXT: .LBB41_16: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB41_8 -; RV64ZVE32F-NEXT: j .LBB41_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_9 +; RV64ZVE32F-NEXT: j .LBB41_10 %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %idxs %v = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x i32> %passthru) ret <8 x i32> %v @@ -4270,80 +4182,82 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB48_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB48_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB48_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB48_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB48_9 ; RV64ZVE32F-NEXT: .LBB48_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB48_12 +; RV64ZVE32F-NEXT: j .LBB48_10 ; RV64ZVE32F-NEXT: .LBB48_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB48_13 -; RV64ZVE32F-NEXT: .LBB48_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB48_14 -; RV64ZVE32F-NEXT: .LBB48_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB48_7 -; RV64ZVE32F-NEXT: .LBB48_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB48_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB48_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB48_8 -; RV64ZVE32F-NEXT: .LBB48_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB48_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB48_9 -; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB48_14 +; RV64ZVE32F-NEXT: .LBB48_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB48_15 +; RV64ZVE32F-NEXT: .LBB48_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB48_12 +; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB48_14: # %else14 +; RV64ZVE32F-NEXT: .LBB48_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB48_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB48_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB48_18 -; RV64ZVE32F-NEXT: .LBB48_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB48_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB48_19 ; RV64ZVE32F-NEXT: .LBB48_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB48_20 +; RV64ZVE32F-NEXT: .LBB48_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB48_16 -; RV64ZVE32F-NEXT: .LBB48_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_17 +; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB48_19: # %else20 +; RV64ZVE32F-NEXT: .LBB48_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -4547,80 +4461,82 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB49_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB49_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB49_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB49_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB49_9 ; RV64ZVE32F-NEXT: .LBB49_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB49_12 +; RV64ZVE32F-NEXT: j .LBB49_10 ; RV64ZVE32F-NEXT: .LBB49_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB49_13 -; RV64ZVE32F-NEXT: .LBB49_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB49_14 -; RV64ZVE32F-NEXT: .LBB49_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB49_7 -; RV64ZVE32F-NEXT: .LBB49_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB49_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB49_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB49_8 -; RV64ZVE32F-NEXT: .LBB49_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB49_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB49_9 -; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB49_14 +; RV64ZVE32F-NEXT: .LBB49_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB49_15 +; RV64ZVE32F-NEXT: .LBB49_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB49_12 +; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB49_14: # %else14 +; RV64ZVE32F-NEXT: .LBB49_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB49_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB49_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB49_18 -; RV64ZVE32F-NEXT: .LBB49_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB49_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB49_19 ; RV64ZVE32F-NEXT: .LBB49_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB49_20 +; RV64ZVE32F-NEXT: .LBB49_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB49_16 -; RV64ZVE32F-NEXT: .LBB49_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_17 +; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB49_19: # %else20 +; RV64ZVE32F-NEXT: .LBB49_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -4828,86 +4744,88 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB50_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB50_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB50_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: zext.b a6, a6 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB50_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB50_9 ; RV64ZVE32F-NEXT: .LBB50_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB50_12 +; RV64ZVE32F-NEXT: j .LBB50_10 ; RV64ZVE32F-NEXT: .LBB50_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB50_13 -; RV64ZVE32F-NEXT: .LBB50_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB50_14 -; RV64ZVE32F-NEXT: .LBB50_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB50_7 -; RV64ZVE32F-NEXT: .LBB50_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB50_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: zext.b a7, a7 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB50_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB50_8 -; RV64ZVE32F-NEXT: .LBB50_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB50_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: zext.b t0, t0 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB50_9 -; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB50_14 +; RV64ZVE32F-NEXT: .LBB50_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB50_15 +; RV64ZVE32F-NEXT: .LBB50_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB50_12 +; RV64ZVE32F-NEXT: .LBB50_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: zext.b t1, t1 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB50_14: # %else14 +; RV64ZVE32F-NEXT: .LBB50_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB50_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB50_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: zext.b t2, t2 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB50_18 -; RV64ZVE32F-NEXT: .LBB50_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB50_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB50_19 ; RV64ZVE32F-NEXT: .LBB50_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB50_20 +; RV64ZVE32F-NEXT: .LBB50_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB50_16 -; RV64ZVE32F-NEXT: .LBB50_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB50_17 +; RV64ZVE32F-NEXT: .LBB50_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB50_19: # %else20 +; RV64ZVE32F-NEXT: .LBB50_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5114,80 +5032,82 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB51_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB51_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB51_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB51_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB51_9 ; RV64ZVE32F-NEXT: .LBB51_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB51_12 +; RV64ZVE32F-NEXT: j .LBB51_10 ; RV64ZVE32F-NEXT: .LBB51_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB51_13 -; RV64ZVE32F-NEXT: .LBB51_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB51_14 -; RV64ZVE32F-NEXT: .LBB51_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB51_7 -; RV64ZVE32F-NEXT: .LBB51_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB51_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB51_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB51_8 -; RV64ZVE32F-NEXT: .LBB51_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB51_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB51_9 -; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB51_14 +; RV64ZVE32F-NEXT: .LBB51_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB51_15 +; RV64ZVE32F-NEXT: .LBB51_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB51_12 +; RV64ZVE32F-NEXT: .LBB51_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB51_14: # %else14 +; RV64ZVE32F-NEXT: .LBB51_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB51_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB51_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB51_18 -; RV64ZVE32F-NEXT: .LBB51_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB51_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB51_19 ; RV64ZVE32F-NEXT: .LBB51_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB51_20 +; RV64ZVE32F-NEXT: .LBB51_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB51_16 -; RV64ZVE32F-NEXT: .LBB51_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB51_17 +; RV64ZVE32F-NEXT: .LBB51_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB51_19: # %else20 +; RV64ZVE32F-NEXT: .LBB51_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5393,80 +5313,82 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB52_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB52_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB52_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB52_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB52_9 ; RV64ZVE32F-NEXT: .LBB52_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB52_12 +; RV64ZVE32F-NEXT: j .LBB52_10 ; RV64ZVE32F-NEXT: .LBB52_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB52_13 -; RV64ZVE32F-NEXT: .LBB52_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB52_14 -; RV64ZVE32F-NEXT: .LBB52_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB52_7 -; RV64ZVE32F-NEXT: .LBB52_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB52_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB52_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB52_8 -; RV64ZVE32F-NEXT: .LBB52_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB52_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB52_9 -; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB52_14 +; RV64ZVE32F-NEXT: .LBB52_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB52_15 +; RV64ZVE32F-NEXT: .LBB52_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB52_12 +; RV64ZVE32F-NEXT: .LBB52_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB52_14: # %else14 +; RV64ZVE32F-NEXT: .LBB52_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB52_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB52_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB52_18 -; RV64ZVE32F-NEXT: .LBB52_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB52_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB52_19 ; RV64ZVE32F-NEXT: .LBB52_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB52_20 +; RV64ZVE32F-NEXT: .LBB52_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB52_16 -; RV64ZVE32F-NEXT: .LBB52_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB52_17 +; RV64ZVE32F-NEXT: .LBB52_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB52_19: # %else20 +; RV64ZVE32F-NEXT: .LBB52_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5676,86 +5598,88 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB53_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB53_10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_8 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 ; RV64ZVE32F-NEXT: slli a6, a6, 48 ; RV64ZVE32F-NEXT: srli a6, a6, 45 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB53_11 +; RV64ZVE32F-NEXT: bnez a7, .LBB53_9 ; RV64ZVE32F-NEXT: .LBB53_7: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB53_12 +; RV64ZVE32F-NEXT: j .LBB53_10 ; RV64ZVE32F-NEXT: .LBB53_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB53_13 -; RV64ZVE32F-NEXT: .LBB53_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB53_14 -; RV64ZVE32F-NEXT: .LBB53_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: beqz a7, .LBB53_7 -; RV64ZVE32F-NEXT: .LBB53_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: .LBB53_9: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 48 ; RV64ZVE32F-NEXT: srli a7, a7, 45 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) +; RV64ZVE32F-NEXT: .LBB53_10: # %else8 ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB53_8 -; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz t0, .LBB53_13 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 48 ; RV64ZVE32F-NEXT: srli t0, t0, 45 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB53_9 -; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: bnez t1, .LBB53_14 +; RV64ZVE32F-NEXT: .LBB53_12: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB53_15 +; RV64ZVE32F-NEXT: .LBB53_13: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB53_12 +; RV64ZVE32F-NEXT: .LBB53_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 48 ; RV64ZVE32F-NEXT: srli t1, t1, 45 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB53_14: # %else14 +; RV64ZVE32F-NEXT: .LBB53_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB53_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB53_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 48 ; RV64ZVE32F-NEXT: srli t2, t2, 45 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB53_18 -; RV64ZVE32F-NEXT: .LBB53_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB53_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB53_19 ; RV64ZVE32F-NEXT: .LBB53_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB53_20 +; RV64ZVE32F-NEXT: .LBB53_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB53_16 -; RV64ZVE32F-NEXT: .LBB53_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB53_17 +; RV64ZVE32F-NEXT: .LBB53_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB53_19: # %else20 +; RV64ZVE32F-NEXT: .LBB53_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5960,80 +5884,84 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB54_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB54_10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB54_7 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB54_11 +; RV64ZVE32F-NEXT: j .LBB54_8 ; RV64ZVE32F-NEXT: .LBB54_7: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB54_12 -; RV64ZVE32F-NEXT: .LBB54_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB54_13 -; RV64ZVE32F-NEXT: .LBB54_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB54_14 -; RV64ZVE32F-NEXT: .LBB54_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB54_8: # %else5 ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB54_7 -; RV64ZVE32F-NEXT: .LBB54_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB54_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB54_8 -; RV64ZVE32F-NEXT: .LBB54_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: bnez t0, .LBB54_13 +; RV64ZVE32F-NEXT: .LBB54_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB54_14 +; RV64ZVE32F-NEXT: .LBB54_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB54_15 +; RV64ZVE32F-NEXT: .LBB54_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB54_10 +; RV64ZVE32F-NEXT: .LBB54_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB54_9 -; RV64ZVE32F-NEXT: .LBB54_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: beqz t1, .LBB54_11 +; RV64ZVE32F-NEXT: .LBB54_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB54_14: # %else14 +; RV64ZVE32F-NEXT: .LBB54_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB54_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB54_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB54_18 -; RV64ZVE32F-NEXT: .LBB54_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB54_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB54_19 ; RV64ZVE32F-NEXT: .LBB54_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB54_20 +; RV64ZVE32F-NEXT: .LBB54_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB54_16 -; RV64ZVE32F-NEXT: .LBB54_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB54_17 +; RV64ZVE32F-NEXT: .LBB54_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB54_19: # %else20 +; RV64ZVE32F-NEXT: .LBB54_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -6237,80 +6165,84 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB55_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB55_10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB55_7 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB55_11 +; RV64ZVE32F-NEXT: j .LBB55_8 ; RV64ZVE32F-NEXT: .LBB55_7: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB55_12 -; RV64ZVE32F-NEXT: .LBB55_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB55_13 -; RV64ZVE32F-NEXT: .LBB55_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB55_14 -; RV64ZVE32F-NEXT: .LBB55_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB55_8: # %else5 ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB55_7 -; RV64ZVE32F-NEXT: .LBB55_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB55_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB55_8 -; RV64ZVE32F-NEXT: .LBB55_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: bnez t0, .LBB55_13 +; RV64ZVE32F-NEXT: .LBB55_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB55_14 +; RV64ZVE32F-NEXT: .LBB55_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB55_15 +; RV64ZVE32F-NEXT: .LBB55_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB55_10 +; RV64ZVE32F-NEXT: .LBB55_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB55_9 -; RV64ZVE32F-NEXT: .LBB55_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: beqz t1, .LBB55_11 +; RV64ZVE32F-NEXT: .LBB55_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB55_14: # %else14 +; RV64ZVE32F-NEXT: .LBB55_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB55_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB55_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB55_18 -; RV64ZVE32F-NEXT: .LBB55_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB55_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB55_19 ; RV64ZVE32F-NEXT: .LBB55_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB55_20 +; RV64ZVE32F-NEXT: .LBB55_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB55_16 -; RV64ZVE32F-NEXT: .LBB55_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB55_17 +; RV64ZVE32F-NEXT: .LBB55_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB55_19: # %else20 +; RV64ZVE32F-NEXT: .LBB55_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -6517,86 +6449,90 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB56_5: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB56_10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB56_7 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v10 ; RV64ZVE32F-NEXT: slli a6, a6, 32 ; RV64ZVE32F-NEXT: srli a6, a6, 29 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: bnez a7, .LBB56_11 +; RV64ZVE32F-NEXT: j .LBB56_8 ; RV64ZVE32F-NEXT: .LBB56_7: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB56_12 -; RV64ZVE32F-NEXT: .LBB56_8: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB56_13 -; RV64ZVE32F-NEXT: .LBB56_9: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB56_14 -; RV64ZVE32F-NEXT: .LBB56_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: .LBB56_8: # %else5 ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: beqz a7, .LBB56_7 -; RV64ZVE32F-NEXT: .LBB56_11: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a7, .LBB56_12 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v9 ; RV64ZVE32F-NEXT: slli a7, a7, 32 ; RV64ZVE32F-NEXT: srli a7, a7, 29 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB56_8 -; RV64ZVE32F-NEXT: .LBB56_12: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s t0, v10 +; RV64ZVE32F-NEXT: bnez t0, .LBB56_13 +; RV64ZVE32F-NEXT: .LBB56_10: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB56_14 +; RV64ZVE32F-NEXT: .LBB56_11: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB56_15 +; RV64ZVE32F-NEXT: .LBB56_12: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB56_10 +; RV64ZVE32F-NEXT: .LBB56_13: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: slli t0, t0, 32 ; RV64ZVE32F-NEXT: srli t0, t0, 29 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB56_9 -; RV64ZVE32F-NEXT: .LBB56_13: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: beqz t1, .LBB56_11 +; RV64ZVE32F-NEXT: .LBB56_14: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: slli t1, t1, 32 ; RV64ZVE32F-NEXT: srli t1, t1, 29 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB56_14: # %else14 +; RV64ZVE32F-NEXT: .LBB56_15: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB56_17 -; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB56_18 +; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 32 ; RV64ZVE32F-NEXT: srli t2, t2, 29 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB56_18 -; RV64ZVE32F-NEXT: .LBB56_16: -; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB56_19 +; RV64ZVE32F-NEXT: bnez a5, .LBB56_19 ; RV64ZVE32F-NEXT: .LBB56_17: +; RV64ZVE32F-NEXT: ld a1, 56(a2) +; RV64ZVE32F-NEXT: j .LBB56_20 +; RV64ZVE32F-NEXT: .LBB56_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB56_16 -; RV64ZVE32F-NEXT: .LBB56_18: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB56_17 +; RV64ZVE32F-NEXT: .LBB56_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB56_19: # %else20 +; RV64ZVE32F-NEXT: .LBB56_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -7269,38 +7205,67 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB64_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB64_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_15 -; RV64ZVE32F-NEXT: .LBB64_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB64_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_16 -; RV64ZVE32F-NEXT: .LBB64_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB64_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_9 -; RV64ZVE32F-NEXT: .LBB64_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB64_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB64_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7309,10 +7274,10 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB64_11: # %else17 +; RV64ZVE32F-NEXT: .LBB64_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB64_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB64_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -7323,47 +7288,10 @@ define <8 x bfloat> @mgather_baseidx_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB64_13: # %else20 +; RV64ZVE32F-NEXT: .LBB64_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 -; RV64ZVE32F-NEXT: .LBB64_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_7 -; RV64ZVE32F-NEXT: .LBB64_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_8 -; RV64ZVE32F-NEXT: j .LBB64_9 %ptrs = getelementptr inbounds bfloat, ptr %base, <8 x i8> %idxs %v = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x bfloat> %passthru) ret <8 x bfloat> %v @@ -7418,38 +7346,67 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB65_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB65_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_15 -; RV64ZVE32F-NEXT: .LBB65_6: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_16 -; RV64ZVE32F-NEXT: .LBB65_7: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_9 -; RV64ZVE32F-NEXT: .LBB65_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB65_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB65_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB65_10: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB65_12: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7458,10 +7415,10 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB65_11: # %else17 +; RV64ZVE32F-NEXT: .LBB65_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB65_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB65_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -7472,47 +7429,10 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB65_13: # %else20 +; RV64ZVE32F-NEXT: .LBB65_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB65_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 -; RV64ZVE32F-NEXT: .LBB65_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_7 -; RV64ZVE32F-NEXT: .LBB65_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_8 -; RV64ZVE32F-NEXT: j .LBB65_9 %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds bfloat, ptr %base, <8 x i16> %eidxs %v = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x bfloat> %passthru) @@ -7568,39 +7488,71 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB66_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB66_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_15 -; RV64ZVE32F-NEXT: .LBB66_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB66_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_16 -; RV64ZVE32F-NEXT: .LBB66_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB66_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_9 -; RV64ZVE32F-NEXT: .LBB66_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB66_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB66_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -7610,10 +7562,10 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB66_11: # %else17 +; RV64ZVE32F-NEXT: .LBB66_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB66_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB66_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -7625,50 +7577,10 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB66_13: # %else20 +; RV64ZVE32F-NEXT: .LBB66_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 -; RV64ZVE32F-NEXT: .LBB66_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_7 -; RV64ZVE32F-NEXT: .LBB66_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_8 -; RV64ZVE32F-NEXT: j .LBB66_9 %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds bfloat, ptr %base, <8 x i16> %eidxs %v = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x bfloat> %passthru) @@ -7721,37 +7633,63 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB67_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB67_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_15 -; RV64ZVE32F-NEXT: .LBB67_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB67_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_16 -; RV64ZVE32F-NEXT: .LBB67_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: .LBB67_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_9 -; RV64ZVE32F-NEXT: .LBB67_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-NEXT: .LBB67_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-NEXT: .LBB67_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7759,10 +7697,10 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-NEXT: .LBB67_11: # %else17 +; RV64ZVE32F-NEXT: .LBB67_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB67_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB67_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -7772,43 +7710,10 @@ define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-NEXT: .LBB67_13: # %else20 +; RV64ZVE32F-NEXT: .LBB67_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 -; RV64ZVE32F-NEXT: .LBB67_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_7 -; RV64ZVE32F-NEXT: .LBB67_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_8 -; RV64ZVE32F-NEXT: j .LBB67_9 %ptrs = getelementptr inbounds bfloat, ptr %base, <8 x i16> %idxs %v = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x bfloat> %passthru) ret <8 x bfloat> %v @@ -8374,38 +8279,67 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFH-NEXT: .LBB74_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB74_14 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_6: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB74_15 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_6: # %else8 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_8: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB74_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_7: # %else11 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_10: # %else11 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_8: # %cond.load13 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_9: # %else14 +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_12: # %else14 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_11 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_14 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 @@ -8414,10 +8348,10 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_11: # %else17 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_14: # %else17 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB74_13 -; RV64ZVE32F-ZVFH-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB74_16 +; RV64ZVE32F-ZVFH-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a1, v8 @@ -8428,47 +8362,10 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_13: # %else20 +; RV64ZVE32F-ZVFH-NEXT: .LBB74_16: # %else20 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB74_14: # %cond.load4 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_15: # %cond.load7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB74_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB74_16: # %cond.load10 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB74_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB74_9 ; ; RV64ZVE32F-ZVFHMIN-LABEL: mgather_baseidx_v8i8_v8f16: ; RV64ZVE32F-ZVFHMIN: # %bb.0: @@ -8498,50 +8395,79 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB74_14 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB74_15 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_6: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB74_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_7: # %else11 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_8: # %cond.load13 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_9: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_11 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_6: # %else5 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 7, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_8: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_10: # %else11 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_12: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_14 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %cond.load16 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_11: # %else17 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_14: # %else17 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB74_13 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB74_16 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 @@ -8552,47 +8478,10 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_13: # %else20 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_16: # %else20 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_14: # %cond.load4 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_15: # %cond.load7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB74_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB74_16: # %cond.load10 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB74_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB74_9 %ptrs = getelementptr inbounds half, ptr %base, <8 x i8> %idxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) ret <8 x half> %v @@ -8647,38 +8536,67 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFH-NEXT: .LBB75_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB75_14 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_6: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB75_15 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_6: # %else8 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_8: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB75_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_7: # %else11 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_10: # %else11 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_8: # %cond.load13 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_9: # %else14 +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_12: # %else14 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_11 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_14 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 @@ -8687,10 +8605,10 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_11: # %else17 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_14: # %else17 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB75_13 -; RV64ZVE32F-ZVFH-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB75_16 +; RV64ZVE32F-ZVFH-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a1, v8 @@ -8701,47 +8619,10 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_13: # %else20 +; RV64ZVE32F-ZVFH-NEXT: .LBB75_16: # %else20 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB75_14: # %cond.load4 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_15: # %cond.load7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB75_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB75_16: # %cond.load10 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB75_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB75_9 ; ; RV64ZVE32F-ZVFHMIN-LABEL: mgather_baseidx_sext_v8i8_v8f16: ; RV64ZVE32F-ZVFHMIN: # %bb.0: @@ -8771,38 +8652,67 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB75_14 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_6: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB75_15 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_6: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_8: # %else8 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB75_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_7: # %else11 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_10: # %else11 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_8: # %cond.load13 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_9: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_12: # %else14 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_11 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_14 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 @@ -8811,10 +8721,10 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_11: # %else17 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_14: # %else17 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB75_13 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB75_16 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 @@ -8825,47 +8735,10 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_13: # %else20 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_16: # %else20 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_14: # %cond.load4 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_15: # %cond.load7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB75_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB75_16: # %cond.load10 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB75_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB75_9 %eidxs = sext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, ptr %base, <8 x i16> %eidxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) @@ -8921,39 +8794,71 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFH-NEXT: .LBB76_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB76_14 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_6: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB76_15 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_6: # %else8 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_8: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB76_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_7: # %else11 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_10: # %else11 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_8: # %cond.load13 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_9: # %else14 +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_12: # %else14 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_11 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_14 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 @@ -8963,10 +8868,10 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_11: # %else17 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_14: # %else17 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB76_13 -; RV64ZVE32F-ZVFH-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB76_16 +; RV64ZVE32F-ZVFH-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a1, v8 @@ -8978,50 +8883,10 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_13: # %else20 +; RV64ZVE32F-ZVFH-NEXT: .LBB76_16: # %else20 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB76_14: # %cond.load4 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_15: # %cond.load7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB76_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB76_16: # %cond.load10 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB76_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB76_9 ; ; RV64ZVE32F-ZVFHMIN-LABEL: mgather_baseidx_zext_v8i8_v8f16: ; RV64ZVE32F-ZVFHMIN: # %bb.0: @@ -9053,39 +8918,71 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB76_14 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB76_15 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_6: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB76_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_7: # %else11 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_8: # %cond.load13 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_6: # %else5 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_8: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_10: # %else11 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_9: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_12: # %else14 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_11 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_14 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 @@ -9095,10 +8992,10 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_11: # %else17 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_14: # %else17 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB76_13 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB76_16 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 @@ -9110,50 +9007,10 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_13: # %else20 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_16: # %else20 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_14: # %cond.load4 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_15: # %cond.load7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB76_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB76_16: # %cond.load10 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB76_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB76_9 %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, ptr %base, <8 x i16> %eidxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) @@ -9206,37 +9063,63 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFH-NEXT: .LBB77_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB77_14 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_6: # %else5 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB77_15 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_6: # %else8 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_8: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB77_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_7: # %else11 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_10: # %else11 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_8: # %cond.load13 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_9: # %else14 +; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_12: # %else14 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_11 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_14 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 @@ -9244,10 +9127,10 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_11: # %else17 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_14: # %else17 ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB77_13 -; RV64ZVE32F-ZVFH-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB77_16 +; RV64ZVE32F-ZVFH-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a1, v8 @@ -9257,43 +9140,10 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_13: # %else20 +; RV64ZVE32F-ZVFH-NEXT: .LBB77_16: # %else20 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB77_14: # %cond.load4 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_15: # %cond.load7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB77_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB77_16: # %cond.load10 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-ZVFH-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-ZVFH-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB77_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB77_9 ; ; RV64ZVE32F-ZVFHMIN-LABEL: mgather_baseidx_v8f16: ; RV64ZVE32F-ZVFHMIN: # %bb.0: @@ -9322,37 +9172,63 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB77_14 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_6: # %else5 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB77_15 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_6: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_8: # %else8 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB77_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_7: # %else11 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_10: # %else11 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_8: # %cond.load13 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_9: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_12: # %else14 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_11 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_14 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 @@ -9360,10 +9236,10 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v10, 6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_11: # %else17 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_14: # %else17 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB77_13 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB77_16 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a1, v8 @@ -9373,43 +9249,10 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_13: # %else20 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_16: # %else20 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_14: # %cond.load4 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_15: # %cond.load7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB77_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB77_16: # %cond.load10 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: lh a2, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vslideup.vi v9, v8, 4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB77_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB77_9 %ptrs = getelementptr inbounds half, ptr %base, <8 x i16> %idxs %v = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> %m, <8 x half> %passthru) ret <8 x half> %v @@ -9789,25 +9632,54 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB84_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 -; RV64ZVE32F-NEXT: .LBB84_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB84_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_16 -; RV64ZVE32F-NEXT: .LBB84_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB84_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_9 -; RV64ZVE32F-NEXT: .LBB84_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -9815,12 +9687,12 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB84_9: # %else14 +; RV64ZVE32F-NEXT: .LBB84_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9829,10 +9701,10 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB84_11: # %else17 +; RV64ZVE32F-NEXT: .LBB84_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB84_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9843,47 +9715,10 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB84_13: # %else20 +; RV64ZVE32F-NEXT: .LBB84_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB84_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 -; RV64ZVE32F-NEXT: .LBB84_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_7 -; RV64ZVE32F-NEXT: .LBB84_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_8 -; RV64ZVE32F-NEXT: j .LBB84_9 %ptrs = getelementptr inbounds float, ptr %base, <8 x i8> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -9937,50 +9772,79 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB85_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_15 -; RV64ZVE32F-NEXT: .LBB85_6: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_16 -; RV64ZVE32F-NEXT: .LBB85_7: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_9 -; RV64ZVE32F-NEXT: .LBB85_8: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB85_9: # %else14 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB85_6: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB85_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB85_10: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB85_12: # %else14 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB85_11: # %else17 +; RV64ZVE32F-NEXT: .LBB85_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB85_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB85_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9991,47 +9855,10 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB85_13: # %else20 +; RV64ZVE32F-NEXT: .LBB85_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB85_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 -; RV64ZVE32F-NEXT: .LBB85_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_7 -; RV64ZVE32F-NEXT: .LBB85_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_8 -; RV64ZVE32F-NEXT: j .LBB85_9 %eidxs = sext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -10089,25 +9916,57 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB86_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB86_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB86_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB86_15 -; RV64ZVE32F-NEXT: .LBB86_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB86_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB86_16 -; RV64ZVE32F-NEXT: .LBB86_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB86_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_9 -; RV64ZVE32F-NEXT: .LBB86_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10116,12 +9975,12 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB86_9: # %else14 +; RV64ZVE32F-NEXT: .LBB86_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -10131,10 +9990,10 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB86_11: # %else17 +; RV64ZVE32F-NEXT: .LBB86_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB86_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB86_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10146,50 +10005,10 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB86_13: # %else20 +; RV64ZVE32F-NEXT: .LBB86_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB86_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_6 -; RV64ZVE32F-NEXT: .LBB86_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_7 -; RV64ZVE32F-NEXT: .LBB86_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB86_8 -; RV64ZVE32F-NEXT: j .LBB86_9 %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -10246,25 +10065,54 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB87_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB87_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_15 -; RV64ZVE32F-NEXT: .LBB87_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB87_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_16 -; RV64ZVE32F-NEXT: .LBB87_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB87_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_9 -; RV64ZVE32F-NEXT: .LBB87_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -10272,12 +10120,12 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB87_9: # %else14 +; RV64ZVE32F-NEXT: .LBB87_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10286,10 +10134,10 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB87_11: # %else17 +; RV64ZVE32F-NEXT: .LBB87_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB87_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB87_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10300,47 +10148,10 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB87_13: # %else20 +; RV64ZVE32F-NEXT: .LBB87_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB87_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_6 -; RV64ZVE32F-NEXT: .LBB87_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_7 -; RV64ZVE32F-NEXT: .LBB87_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB87_8 -; RV64ZVE32F-NEXT: j .LBB87_9 %ptrs = getelementptr inbounds float, ptr %base, <8 x i16> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -10396,25 +10207,54 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB88_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB88_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_15 -; RV64ZVE32F-NEXT: .LBB88_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB88_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_16 -; RV64ZVE32F-NEXT: .LBB88_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB88_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_9 -; RV64ZVE32F-NEXT: .LBB88_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -10422,12 +10262,12 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB88_9: # %else14 +; RV64ZVE32F-NEXT: .LBB88_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10436,10 +10276,10 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB88_11: # %else17 +; RV64ZVE32F-NEXT: .LBB88_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB88_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB88_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10450,47 +10290,10 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB88_13: # %else20 +; RV64ZVE32F-NEXT: .LBB88_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB88_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_6 -; RV64ZVE32F-NEXT: .LBB88_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_7 -; RV64ZVE32F-NEXT: .LBB88_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB88_8 -; RV64ZVE32F-NEXT: j .LBB88_9 %eidxs = sext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -10549,25 +10352,57 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 1 ; RV64ZVE32F-NEXT: .LBB89_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: .LBB89_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_15 -; RV64ZVE32F-NEXT: .LBB89_6: # %else8 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 +; RV64ZVE32F-NEXT: .LBB89_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_16 -; RV64ZVE32F-NEXT: .LBB89_7: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: .LBB89_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_9 -; RV64ZVE32F-NEXT: .LBB89_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10576,12 +10411,12 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB89_9: # %else14 +; RV64ZVE32F-NEXT: .LBB89_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 @@ -10591,10 +10426,10 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB89_11: # %else17 +; RV64ZVE32F-NEXT: .LBB89_14: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB89_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB89_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10606,50 +10441,10 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB89_13: # %else20 +; RV64ZVE32F-NEXT: .LBB89_16: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB89_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_6 -; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_7 -; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB89_8 -; RV64ZVE32F-NEXT: j .LBB89_9 %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) @@ -10700,37 +10495,45 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB90_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB90_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v13, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 2 +; RV64ZVE32F-NEXT: .LBB90_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB90_15 -; RV64ZVE32F-NEXT: .LBB90_6: # %else8 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB90_16 -; RV64ZVE32F-NEXT: .LBB90_7: # %else11 +; RV64ZVE32F-NEXT: .LBB90_8: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_9 -; RV64ZVE32F-NEXT: .LBB90_8: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_10 +; RV64ZVE32F-NEXT: .LBB90_9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 -; RV64ZVE32F-NEXT: .LBB90_9: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 +; RV64ZVE32F-NEXT: .LBB90_10: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10738,10 +10541,10 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 -; RV64ZVE32F-NEXT: .LBB90_11: # %else17 +; RV64ZVE32F-NEXT: .LBB90_12: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB90_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a1, .LBB90_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10751,42 +10554,32 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 7 -; RV64ZVE32F-NEXT: .LBB90_13: # %else20 +; RV64ZVE32F-NEXT: .LBB90_14: # %else20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB90_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_6 ; RV64ZVE32F-NEXT: .LBB90_15: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_8 ; RV64ZVE32F-NEXT: .LBB90_16: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB90_8 -; RV64ZVE32F-NEXT: j .LBB90_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB90_9 +; RV64ZVE32F-NEXT: j .LBB90_10 %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %idxs %v = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x float> %passthru) ret <8 x float> %v @@ -11418,46 +11211,64 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB97_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB97_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB97_15 -; RV64ZVE32F-NEXT: .LBB97_6: # %else8 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB97_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB97_16 -; RV64ZVE32F-NEXT: .LBB97_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB97_9 -; RV64ZVE32F-NEXT: .LBB97_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB97_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB97_9: # %else14 +; RV64ZVE32F-NEXT: .LBB97_12: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB97_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB97_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB97_11: # %else17 +; RV64ZVE32F-NEXT: .LBB97_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB97_13: # %else20 +; RV64ZVE32F-NEXT: .LBB97_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11467,29 +11278,6 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB97_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB97_6 -; RV64ZVE32F-NEXT: .LBB97_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB97_7 -; RV64ZVE32F-NEXT: .LBB97_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB97_8 -; RV64ZVE32F-NEXT: j .LBB97_9 %ptrs = getelementptr inbounds double, ptr %base, <8 x i8> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -11633,46 +11421,64 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB98_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB98_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB98_15 -; RV64ZVE32F-NEXT: .LBB98_6: # %else8 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB98_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB98_16 -; RV64ZVE32F-NEXT: .LBB98_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB98_9 -; RV64ZVE32F-NEXT: .LBB98_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB98_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB98_9: # %else14 +; RV64ZVE32F-NEXT: .LBB98_12: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB98_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB98_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB98_11: # %else17 +; RV64ZVE32F-NEXT: .LBB98_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB98_13: # %else20 +; RV64ZVE32F-NEXT: .LBB98_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11682,29 +11488,6 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB98_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB98_6 -; RV64ZVE32F-NEXT: .LBB98_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB98_7 -; RV64ZVE32F-NEXT: .LBB98_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB98_8 -; RV64ZVE32F-NEXT: j .LBB98_9 %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -11852,49 +11635,70 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB99_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB99_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB99_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: zext.b a3, a3 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB99_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB99_15 -; RV64ZVE32F-NEXT: .LBB99_6: # %else8 +; RV64ZVE32F-NEXT: beqz a3, .LBB99_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: zext.b a3, a3 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB99_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB99_16 -; RV64ZVE32F-NEXT: .LBB99_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB99_9 -; RV64ZVE32F-NEXT: .LBB99_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB99_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: zext.b a3, a3 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB99_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB99_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: zext.b a3, a3 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB99_9: # %else14 +; RV64ZVE32F-NEXT: .LBB99_12: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB99_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB99_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: zext.b a3, a3 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB99_11: # %else17 +; RV64ZVE32F-NEXT: .LBB99_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB99_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB99_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB99_13: # %else20 +; RV64ZVE32F-NEXT: .LBB99_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11904,32 +11708,6 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB99_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: zext.b a3, a3 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB99_6 -; RV64ZVE32F-NEXT: .LBB99_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: zext.b a3, a3 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB99_7 -; RV64ZVE32F-NEXT: .LBB99_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: zext.b a3, a3 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB99_8 -; RV64ZVE32F-NEXT: j .LBB99_9 %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -12076,46 +11854,64 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB100_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB100_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB100_15 -; RV64ZVE32F-NEXT: .LBB100_6: # %else8 -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB100_16 -; RV64ZVE32F-NEXT: .LBB100_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB100_9 -; RV64ZVE32F-NEXT: .LBB100_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB100_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB100_9: # %else14 -; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB100_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB100_6: # %else5 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB100_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB100_8: # %else8 +; RV64ZVE32F-NEXT: andi a3, a2, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB100_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB100_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB100_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa5, 0(a3) +; RV64ZVE32F-NEXT: .LBB100_12: # %else14 +; RV64ZVE32F-NEXT: andi a3, a2, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB100_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB100_11: # %else17 +; RV64ZVE32F-NEXT: .LBB100_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB100_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB100_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB100_13: # %else20 +; RV64ZVE32F-NEXT: .LBB100_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -12125,29 +11921,6 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB100_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB100_6 -; RV64ZVE32F-NEXT: .LBB100_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB100_7 -; RV64ZVE32F-NEXT: .LBB100_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB100_8 -; RV64ZVE32F-NEXT: j .LBB100_9 %ptrs = getelementptr inbounds double, ptr %base, <8 x i16> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -12293,46 +12066,64 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB101_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB101_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB101_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB101_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB101_15 -; RV64ZVE32F-NEXT: .LBB101_6: # %else8 +; RV64ZVE32F-NEXT: beqz a3, .LBB101_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB101_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB101_16 -; RV64ZVE32F-NEXT: .LBB101_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB101_9 -; RV64ZVE32F-NEXT: .LBB101_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB101_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB101_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB101_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB101_9: # %else14 +; RV64ZVE32F-NEXT: .LBB101_12: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB101_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB101_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB101_11: # %else17 +; RV64ZVE32F-NEXT: .LBB101_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB101_13: # %else20 +; RV64ZVE32F-NEXT: .LBB101_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -12342,29 +12133,6 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB101_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB101_6 -; RV64ZVE32F-NEXT: .LBB101_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB101_7 -; RV64ZVE32F-NEXT: .LBB101_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB101_8 -; RV64ZVE32F-NEXT: j .LBB101_9 %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -12514,49 +12282,70 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB102_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB102_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB102_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: bnez a3, .LBB102_15 -; RV64ZVE32F-NEXT: .LBB102_6: # %else8 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa3, 0(a3) +; RV64ZVE32F-NEXT: .LBB102_8: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: bnez a3, .LBB102_16 -; RV64ZVE32F-NEXT: .LBB102_7: # %else11 -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB102_9 -; RV64ZVE32F-NEXT: .LBB102_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 48 ; RV64ZVE32F-NEXT: srli a3, a3, 45 ; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa4, 0(a3) +; RV64ZVE32F-NEXT: .LBB102_10: # %else11 +; RV64ZVE32F-NEXT: andi a3, a2, 32 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: slli a3, a3, 48 +; RV64ZVE32F-NEXT: srli a3, a3, 45 +; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB102_9: # %else14 +; RV64ZVE32F-NEXT: .LBB102_12: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB102_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB102_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 48 ; RV64ZVE32F-NEXT: srli a3, a3, 45 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB102_11: # %else17 +; RV64ZVE32F-NEXT: .LBB102_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB102_13: # %else20 +; RV64ZVE32F-NEXT: .LBB102_16: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -12566,32 +12355,6 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB102_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 48 -; RV64ZVE32F-NEXT: srli a3, a3, 45 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB102_6 -; RV64ZVE32F-NEXT: .LBB102_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 48 -; RV64ZVE32F-NEXT: srli a3, a3, 45 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa3, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB102_7 -; RV64ZVE32F-NEXT: .LBB102_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 48 -; RV64ZVE32F-NEXT: srli a3, a3, 45 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa4, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB102_8 -; RV64ZVE32F-NEXT: j .LBB102_9 %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -12736,46 +12499,53 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB103_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB103_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB103_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB103_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB103_15 -; RV64ZVE32F-NEXT: .LBB103_6: # %else8 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB103_16 -; RV64ZVE32F-NEXT: .LBB103_7: # %else11 +; RV64ZVE32F-NEXT: .LBB103_8: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB103_9 -; RV64ZVE32F-NEXT: .LBB103_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: beqz a3, .LBB103_10 +; RV64ZVE32F-NEXT: .LBB103_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB103_9: # %else14 +; RV64ZVE32F-NEXT: .LBB103_10: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB103_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB103_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB103_11: # %else17 +; RV64ZVE32F-NEXT: .LBB103_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB103_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB103_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB103_13: # %else20 +; RV64ZVE32F-NEXT: .LBB103_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -12785,29 +12555,24 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB103_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB103_6 ; RV64ZVE32F-NEXT: .LBB103_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB103_7 +; RV64ZVE32F-NEXT: beqz a3, .LBB103_8 ; RV64ZVE32F-NEXT: .LBB103_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB103_8 -; RV64ZVE32F-NEXT: j .LBB103_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB103_9 +; RV64ZVE32F-NEXT: j .LBB103_10 %ptrs = getelementptr inbounds double, ptr %base, <8 x i32> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -12951,46 +12716,53 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB104_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB104_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB104_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB104_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB104_15 -; RV64ZVE32F-NEXT: .LBB104_6: # %else8 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB104_16 -; RV64ZVE32F-NEXT: .LBB104_7: # %else11 +; RV64ZVE32F-NEXT: .LBB104_8: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB104_9 -; RV64ZVE32F-NEXT: .LBB104_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: beqz a3, .LBB104_10 +; RV64ZVE32F-NEXT: .LBB104_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB104_9: # %else14 +; RV64ZVE32F-NEXT: .LBB104_10: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB104_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB104_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB104_11: # %else17 +; RV64ZVE32F-NEXT: .LBB104_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB104_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB104_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB104_13: # %else20 +; RV64ZVE32F-NEXT: .LBB104_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -13000,29 +12772,24 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB104_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB104_6 ; RV64ZVE32F-NEXT: .LBB104_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB104_7 +; RV64ZVE32F-NEXT: beqz a3, .LBB104_8 ; RV64ZVE32F-NEXT: .LBB104_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB104_8 -; RV64ZVE32F-NEXT: j .LBB104_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB104_9 +; RV64ZVE32F-NEXT: j .LBB104_10 %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -13169,49 +12936,57 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB105_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a3, .LBB105_14 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB105_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a3, a3, 32 +; RV64ZVE32F-NEXT: srli a3, a3, 29 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: .LBB105_6: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB105_15 -; RV64ZVE32F-NEXT: .LBB105_6: # %else8 +; RV64ZVE32F-NEXT: # %bb.7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB105_16 -; RV64ZVE32F-NEXT: .LBB105_7: # %else11 +; RV64ZVE32F-NEXT: .LBB105_8: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB105_9 -; RV64ZVE32F-NEXT: .LBB105_8: # %cond.load13 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: beqz a3, .LBB105_10 +; RV64ZVE32F-NEXT: .LBB105_9: # %cond.load13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB105_9: # %else14 +; RV64ZVE32F-NEXT: .LBB105_10: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB105_11 -; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB105_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB105_11: # %else17 +; RV64ZVE32F-NEXT: .LBB105_12: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB105_13 -; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB105_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB105_13: # %else20 +; RV64ZVE32F-NEXT: .LBB105_14: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -13221,32 +12996,26 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB105_14: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 -; RV64ZVE32F-NEXT: slli a3, a3, 32 -; RV64ZVE32F-NEXT: srli a3, a3, 29 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: beqz a3, .LBB105_6 ; RV64ZVE32F-NEXT: .LBB105_15: # %cond.load7 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB105_7 +; RV64ZVE32F-NEXT: beqz a3, .LBB105_8 ; RV64ZVE32F-NEXT: .LBB105_16: # %cond.load10 -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB105_8 -; RV64ZVE32F-NEXT: j .LBB105_9 +; RV64ZVE32F-NEXT: bnez a3, .LBB105_9 +; RV64ZVE32F-NEXT: j .LBB105_10 %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -13526,31 +13295,45 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB107_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_25 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: .LBB107_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_26 -; RV64ZVE32F-NEXT: .LBB107_6: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB107_8 -; RV64ZVE32F-NEXT: .LBB107_7: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: .LBB107_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 -; RV64ZVE32F-NEXT: .LBB107_8: # %else11 +; RV64ZVE32F-NEXT: .LBB107_10: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB107_10 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -13559,71 +13342,119 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v11, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 -; RV64ZVE32F-NEXT: .LBB107_10: # %else14 +; RV64ZVE32F-NEXT: .LBB107_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_27 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6 +; RV64ZVE32F-NEXT: .LBB107_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_28 -; RV64ZVE32F-NEXT: .LBB107_12: # %else20 -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_29 -; RV64ZVE32F-NEXT: .LBB107_13: # %else23 -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB107_15 -; RV64ZVE32F-NEXT: .LBB107_14: # %cond.load25 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9 -; RV64ZVE32F-NEXT: .LBB107_15: # %else26 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 1024 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7 +; RV64ZVE32F-NEXT: .LBB107_16: # %else20 +; RV64ZVE32F-NEXT: andi a2, a1, 256 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load22 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 8 +; RV64ZVE32F-NEXT: .LBB107_18: # %else23 +; RV64ZVE32F-NEXT: andi a2, a1, 512 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.load25 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9 +; RV64ZVE32F-NEXT: .LBB107_20: # %else26 +; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_30 -; RV64ZVE32F-NEXT: # %bb.16: # %else29 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB107_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 +; RV64ZVE32F-NEXT: .LBB107_22: # %else29 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bltz a2, .LBB107_31 -; RV64ZVE32F-NEXT: .LBB107_17: # %else32 +; RV64ZVE32F-NEXT: bgez a2, .LBB107_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.load31 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 11 +; RV64ZVE32F-NEXT: .LBB107_24: # %else32 ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bltz a2, .LBB107_32 -; RV64ZVE32F-NEXT: .LBB107_18: # %else35 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB107_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.load34 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 12 +; RV64ZVE32F-NEXT: .LBB107_26: # %else35 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB107_20 -; RV64ZVE32F-NEXT: .LBB107_19: # %cond.load37 +; RV64ZVE32F-NEXT: bgez a2, .LBB107_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.load37 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 13 -; RV64ZVE32F-NEXT: .LBB107_20: # %else38 +; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 13 +; RV64ZVE32F-NEXT: .LBB107_28: # %else38 ; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB107_22 -; RV64ZVE32F-NEXT: # %bb.21: # %cond.load40 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB107_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.load40 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 14 -; RV64ZVE32F-NEXT: .LBB107_22: # %else41 +; RV64ZVE32F-NEXT: .LBB107_30: # %else41 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB107_24 -; RV64ZVE32F-NEXT: # %bb.23: # %cond.load43 +; RV64ZVE32F-NEXT: beqz a1, .LBB107_32 +; RV64ZVE32F-NEXT: # %bb.31: # %cond.load43 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -13632,91 +13463,10 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 15 -; RV64ZVE32F-NEXT: .LBB107_24: # %else44 +; RV64ZVE32F-NEXT: .LBB107_32: # %else44 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB107_25: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB107_6 -; RV64ZVE32F-NEXT: .LBB107_26: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_7 -; RV64ZVE32F-NEXT: j .LBB107_8 -; RV64ZVE32F-NEXT: .LBB107_27: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6 -; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB107_12 -; RV64ZVE32F-NEXT: .LBB107_28: # %cond.load19 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7 -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB107_13 -; RV64ZVE32F-NEXT: .LBB107_29: # %cond.load22 -; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 8 -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB107_14 -; RV64ZVE32F-NEXT: j .LBB107_15 -; RV64ZVE32F-NEXT: .LBB107_30: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 -; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bgez a2, .LBB107_17 -; RV64ZVE32F-NEXT: .LBB107_31: # %cond.load31 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 11 -; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bgez a2, .LBB107_18 -; RV64ZVE32F-NEXT: .LBB107_32: # %cond.load34 -; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v8, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 12 -; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bltz a2, .LBB107_19 -; RV64ZVE32F-NEXT: j .LBB107_20 %ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs %v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru) ret <16 x i8> %v @@ -13781,54 +13531,93 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB108_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_49 -; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 2 +; RV64ZVE32F-NEXT: .LBB108_6: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_50 -; RV64ZVE32F-NEXT: .LBB108_6: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_8 -; RV64ZVE32F-NEXT: .LBB108_7: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.load7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 -; RV64ZVE32F-NEXT: .LBB108_8: # %else11 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: .LBB108_8: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB108_10 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 4 +; RV64ZVE32F-NEXT: .LBB108_10: # %else11 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 -; RV64ZVE32F-NEXT: .LBB108_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 5 +; RV64ZVE32F-NEXT: .LBB108_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_51 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.load16 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 6 +; RV64ZVE32F-NEXT: .LBB108_14: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_52 -; RV64ZVE32F-NEXT: .LBB108_12: # %else20 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 7 +; RV64ZVE32F-NEXT: .LBB108_16: # %else20 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_53 -; RV64ZVE32F-NEXT: .LBB108_13: # %else23 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.load22 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 8 +; RV64ZVE32F-NEXT: .LBB108_18: # %else23 ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB108_15 -; RV64ZVE32F-NEXT: .LBB108_14: # %cond.load25 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.load25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -13837,71 +13626,78 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 9 -; RV64ZVE32F-NEXT: .LBB108_15: # %else26 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 +; RV64ZVE32F-NEXT: .LBB108_20: # %else26 ; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB108_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB108_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v14, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10 -; RV64ZVE32F-NEXT: .LBB108_17: # %else29 +; RV64ZVE32F-NEXT: .LBB108_22: # %else29 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.load31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 11 -; RV64ZVE32F-NEXT: .LBB108_19: # %else32 +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 11 +; RV64ZVE32F-NEXT: .LBB108_24: # %else32 ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.load34 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 12 -; RV64ZVE32F-NEXT: .LBB108_21: # %else35 +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 12 +; RV64ZVE32F-NEXT: .LBB108_26: # %else35 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.load37 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 13 -; RV64ZVE32F-NEXT: .LBB108_23: # %else38 +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 13 +; RV64ZVE32F-NEXT: .LBB108_28: # %else38 ; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_54 -; RV64ZVE32F-NEXT: # %bb.24: # %else41 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.load40 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v13, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 14 +; RV64ZVE32F-NEXT: .LBB108_30: # %else41 ; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_55 -; RV64ZVE32F-NEXT: .LBB108_25: # %else44 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 +; RV64ZVE32F-NEXT: bltz a2, .LBB108_63 +; RV64ZVE32F-NEXT: # %bb.31: # %else44 ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_56 -; RV64ZVE32F-NEXT: .LBB108_26: # %else47 +; RV64ZVE32F-NEXT: bltz a2, .LBB108_64 +; RV64ZVE32F-NEXT: .LBB108_32: # %else47 ; RV64ZVE32F-NEXT: slli a2, a1, 46 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_28 -; RV64ZVE32F-NEXT: .LBB108_27: # %cond.load49 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_34 +; RV64ZVE32F-NEXT: .LBB108_33: # %cond.load49 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -13910,32 +13706,46 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 18, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 17 -; RV64ZVE32F-NEXT: .LBB108_28: # %else50 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB108_34: # %else50 ; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_57 -; RV64ZVE32F-NEXT: # %bb.29: # %else53 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_36 +; RV64ZVE32F-NEXT: # %bb.35: # %cond.load52 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 18 +; RV64ZVE32F-NEXT: .LBB108_36: # %else53 ; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_58 -; RV64ZVE32F-NEXT: .LBB108_30: # %else56 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_38 +; RV64ZVE32F-NEXT: # %bb.37: # %cond.load55 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 19 +; RV64ZVE32F-NEXT: .LBB108_38: # %else56 ; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_32 -; RV64ZVE32F-NEXT: .LBB108_31: # %cond.load58 -; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_40 +; RV64ZVE32F-NEXT: # %bb.39: # %cond.load58 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20 -; RV64ZVE32F-NEXT: .LBB108_32: # %else59 +; RV64ZVE32F-NEXT: .LBB108_40: # %else59 ; RV64ZVE32F-NEXT: slli a2, a1, 42 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_34 -; RV64ZVE32F-NEXT: # %bb.33: # %cond.load61 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_42 +; RV64ZVE32F-NEXT: # %bb.41: # %cond.load61 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -13944,21 +13754,46 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 21 -; RV64ZVE32F-NEXT: .LBB108_34: # %else62 +; RV64ZVE32F-NEXT: .LBB108_42: # %else62 ; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_59 -; RV64ZVE32F-NEXT: # %bb.35: # %else65 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_44 +; RV64ZVE32F-NEXT: # %bb.43: # %cond.load64 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 23, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22 +; RV64ZVE32F-NEXT: .LBB108_44: # %else65 ; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_60 -; RV64ZVE32F-NEXT: .LBB108_36: # %else68 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.load67 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 24, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23 +; RV64ZVE32F-NEXT: .LBB108_46: # %else68 ; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_61 -; RV64ZVE32F-NEXT: .LBB108_37: # %else71 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.load70 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 25, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 24 +; RV64ZVE32F-NEXT: .LBB108_48: # %else71 ; RV64ZVE32F-NEXT: slli a2, a1, 38 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_39 -; RV64ZVE32F-NEXT: .LBB108_38: # %cond.load73 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_50 +; RV64ZVE32F-NEXT: # %bb.49: # %cond.load73 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -13967,48 +13802,71 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 26, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 25 -; RV64ZVE32F-NEXT: .LBB108_39: # %else74 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: .LBB108_50: # %else74 ; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_62 -; RV64ZVE32F-NEXT: # %bb.40: # %else77 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_52 +; RV64ZVE32F-NEXT: # %bb.51: # %cond.load76 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 +; RV64ZVE32F-NEXT: .LBB108_52: # %else77 ; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_63 -; RV64ZVE32F-NEXT: .LBB108_41: # %else80 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_54 +; RV64ZVE32F-NEXT: # %bb.53: # %cond.load79 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27 +; RV64ZVE32F-NEXT: .LBB108_54: # %else80 ; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_64 -; RV64ZVE32F-NEXT: .LBB108_42: # %else83 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_56 +; RV64ZVE32F-NEXT: # %bb.55: # %cond.load82 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28 +; RV64ZVE32F-NEXT: .LBB108_56: # %else83 ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_44 -; RV64ZVE32F-NEXT: .LBB108_43: # %cond.load85 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_58 +; RV64ZVE32F-NEXT: # %bb.57: # %cond.load85 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29 -; RV64ZVE32F-NEXT: .LBB108_44: # %else86 +; RV64ZVE32F-NEXT: .LBB108_58: # %else86 ; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_46 -; RV64ZVE32F-NEXT: # %bb.45: # %cond.load88 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_60 +; RV64ZVE32F-NEXT: # %bb.59: # %cond.load88 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 31, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 30 -; RV64ZVE32F-NEXT: .LBB108_46: # %else89 +; RV64ZVE32F-NEXT: .LBB108_60: # %else89 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB108_48 -; RV64ZVE32F-NEXT: # %bb.47: # %cond.load91 +; RV64ZVE32F-NEXT: beqz a1, .LBB108_62 +; RV64ZVE32F-NEXT: # %bb.61: # %cond.load91 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -14018,73 +13876,13 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 31 -; RV64ZVE32F-NEXT: .LBB108_48: # %else92 +; RV64ZVE32F-NEXT: .LBB108_62: # %else92 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB108_49: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB108_6 -; RV64ZVE32F-NEXT: .LBB108_50: # %cond.load7 +; RV64ZVE32F-NEXT: .LBB108_63: # %cond.load43 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_7 -; RV64ZVE32F-NEXT: j .LBB108_8 -; RV64ZVE32F-NEXT: .LBB108_51: # %cond.load16 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6 -; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB108_12 -; RV64ZVE32F-NEXT: .LBB108_52: # %cond.load19 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v13, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 7 -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB108_13 -; RV64ZVE32F-NEXT: .LBB108_53: # %cond.load22 -; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v13, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v13, 8 -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB108_14 -; RV64ZVE32F-NEXT: j .LBB108_15 -; RV64ZVE32F-NEXT: .LBB108_54: # %cond.load40 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 14 -; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_25 -; RV64ZVE32F-NEXT: .LBB108_55: # %cond.load43 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -14092,8 +13890,8 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v9, 15 ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_26 -; RV64ZVE32F-NEXT: .LBB108_56: # %cond.load46 +; RV64ZVE32F-NEXT: bgez a2, .LBB108_32 +; RV64ZVE32F-NEXT: .LBB108_64: # %cond.load46 ; RV64ZVE32F-NEXT: vsetivli zero, 17, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -14101,89 +13899,8 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 16 ; RV64ZVE32F-NEXT: slli a2, a1, 46 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_27 -; RV64ZVE32F-NEXT: j .LBB108_28 -; RV64ZVE32F-NEXT: .LBB108_57: # %cond.load52 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18 -; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_30 -; RV64ZVE32F-NEXT: .LBB108_58: # %cond.load55 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 19 -; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_31 -; RV64ZVE32F-NEXT: j .LBB108_32 -; RV64ZVE32F-NEXT: .LBB108_59: # %cond.load64 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 23, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22 -; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_36 -; RV64ZVE32F-NEXT: .LBB108_60: # %cond.load67 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 24, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23 -; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_37 -; RV64ZVE32F-NEXT: .LBB108_61: # %cond.load70 -; RV64ZVE32F-NEXT: vsetivli zero, 25, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 24 -; RV64ZVE32F-NEXT: slli a2, a1, 38 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_38 -; RV64ZVE32F-NEXT: j .LBB108_39 -; RV64ZVE32F-NEXT: .LBB108_62: # %cond.load76 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 -; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_41 -; RV64ZVE32F-NEXT: .LBB108_63: # %cond.load79 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27 -; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bgez a2, .LBB108_42 -; RV64ZVE32F-NEXT: .LBB108_64: # %cond.load82 -; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28 -; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bltz a2, .LBB108_43 -; RV64ZVE32F-NEXT: j .LBB108_44 +; RV64ZVE32F-NEXT: bltz a2, .LBB108_33 +; RV64ZVE32F-NEXT: j .LBB108_34 %ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru) ret <32 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index 4cd15f8a03d6d..e86fae6d501e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -511,66 +511,59 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB9_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_13 -; RV64ZVE32F-NEXT: .LBB9_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_14 -; RV64ZVE32F-NEXT: .LBB9_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_9 -; RV64ZVE32F-NEXT: .LBB9_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB9_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB9_16 -; RV64ZVE32F-NEXT: .LBB9_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB9_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB9_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 -; RV64ZVE32F-NEXT: .LBB9_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB9_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_7 -; RV64ZVE32F-NEXT: .LBB9_14: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB9_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_8 -; RV64ZVE32F-NEXT: j .LBB9_9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB9_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB9_16 +; RV64ZVE32F-NEXT: .LBB9_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB9_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -578,7 +571,7 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB9_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB9_14 ; RV64ZVE32F-NEXT: .LBB9_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1033,71 +1026,64 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_13 -; RV64ZVE32F-NEXT: .LBB18_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_14 -; RV64ZVE32F-NEXT: .LBB18_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_9 -; RV64ZVE32F-NEXT: .LBB18_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB18_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB18_16 -; RV64ZVE32F-NEXT: .LBB18_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB18_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB18_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 -; RV64ZVE32F-NEXT: .LBB18_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB18_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_7 -; RV64ZVE32F-NEXT: .LBB18_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB18_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_8 -; RV64ZVE32F-NEXT: j .LBB18_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB18_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB18_16 +; RV64ZVE32F-NEXT: .LBB18_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB18_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1106,7 +1092,7 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB18_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB18_14 ; RV64ZVE32F-NEXT: .LBB18_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1166,71 +1152,64 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB19_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_13 -; RV64ZVE32F-NEXT: .LBB19_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_14 -; RV64ZVE32F-NEXT: .LBB19_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_9 -; RV64ZVE32F-NEXT: .LBB19_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB19_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB19_16 -; RV64ZVE32F-NEXT: .LBB19_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB19_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB19_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 -; RV64ZVE32F-NEXT: .LBB19_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB19_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_7 -; RV64ZVE32F-NEXT: .LBB19_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB19_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_8 -; RV64ZVE32F-NEXT: j .LBB19_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB19_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB19_16 +; RV64ZVE32F-NEXT: .LBB19_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB19_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1239,7 +1218,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB19_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB19_14 ; RV64ZVE32F-NEXT: .LBB19_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1300,75 +1279,68 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB20_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_13 -; RV64ZVE32F-NEXT: .LBB20_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_14 -; RV64ZVE32F-NEXT: .LBB20_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_9 -; RV64ZVE32F-NEXT: .LBB20_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB20_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 -; RV64ZVE32F-NEXT: .LBB20_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB20_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB20_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 -; RV64ZVE32F-NEXT: .LBB20_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB20_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_7 -; RV64ZVE32F-NEXT: .LBB20_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB20_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_8 -; RV64ZVE32F-NEXT: j .LBB20_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB20_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 +; RV64ZVE32F-NEXT: .LBB20_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB20_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 @@ -1378,7 +1350,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB20_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB20_14 ; RV64ZVE32F-NEXT: .LBB20_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1438,70 +1410,63 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB21_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB21_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_13 -; RV64ZVE32F-NEXT: .LBB21_6: # %else6 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB21_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_14 -; RV64ZVE32F-NEXT: .LBB21_7: # %else8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB21_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_9 -; RV64ZVE32F-NEXT: .LBB21_8: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB21_9: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB21_12: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB21_16 -; RV64ZVE32F-NEXT: .LBB21_11: # %else14 +; RV64ZVE32F-NEXT: .LBB21_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB21_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 -; RV64ZVE32F-NEXT: .LBB21_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_7 -; RV64ZVE32F-NEXT: .LBB21_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_8 -; RV64ZVE32F-NEXT: j .LBB21_9 ; RV64ZVE32F-NEXT: .LBB21_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1510,7 +1475,7 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB21_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB21_14 ; RV64ZVE32F-NEXT: .LBB21_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1913,74 +1878,66 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_13 -; RV64ZVE32F-NEXT: .LBB29_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_14 -; RV64ZVE32F-NEXT: .LBB29_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_9 -; RV64ZVE32F-NEXT: .LBB29_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB29_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 -; RV64ZVE32F-NEXT: .LBB29_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB29_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 -; RV64ZVE32F-NEXT: .LBB29_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB29_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_7 -; RV64ZVE32F-NEXT: .LBB29_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_8 -; RV64ZVE32F-NEXT: j .LBB29_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB29_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 +; RV64ZVE32F-NEXT: .LBB29_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB29_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -1990,7 +1947,7 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB29_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB29_14 ; RV64ZVE32F-NEXT: .LBB29_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2050,74 +2007,66 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_13 -; RV64ZVE32F-NEXT: .LBB30_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_14 -; RV64ZVE32F-NEXT: .LBB30_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_9 -; RV64ZVE32F-NEXT: .LBB30_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB30_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 -; RV64ZVE32F-NEXT: .LBB30_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB30_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_6 -; RV64ZVE32F-NEXT: .LBB30_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB30_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_7 -; RV64ZVE32F-NEXT: .LBB30_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_8 -; RV64ZVE32F-NEXT: j .LBB30_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB30_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 +; RV64ZVE32F-NEXT: .LBB30_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB30_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2127,7 +2076,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB30_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB30_14 ; RV64ZVE32F-NEXT: .LBB30_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2191,44 +2140,12 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB31_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_13 -; RV64ZVE32F-NEXT: .LBB31_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_14 -; RV64ZVE32F-NEXT: .LBB31_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_9 -; RV64ZVE32F-NEXT: .LBB31_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB31_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 -; RV64ZVE32F-NEXT: .LBB31_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB31_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2236,33 +2153,57 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 -; RV64ZVE32F-NEXT: .LBB31_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB31_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_7 -; RV64ZVE32F-NEXT: .LBB31_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_8 -; RV64ZVE32F-NEXT: j .LBB31_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB31_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 +; RV64ZVE32F-NEXT: .LBB31_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 @@ -2273,7 +2214,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB31_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB31_14 ; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2337,74 +2278,66 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB32_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_13 -; RV64ZVE32F-NEXT: .LBB32_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_14 -; RV64ZVE32F-NEXT: .LBB32_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_9 -; RV64ZVE32F-NEXT: .LBB32_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB32_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 -; RV64ZVE32F-NEXT: .LBB32_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB32_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_6 -; RV64ZVE32F-NEXT: .LBB32_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB32_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_7 -; RV64ZVE32F-NEXT: .LBB32_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_8 -; RV64ZVE32F-NEXT: j .LBB32_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB32_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 +; RV64ZVE32F-NEXT: .LBB32_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB32_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2414,7 +2347,7 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB32_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB32_14 ; RV64ZVE32F-NEXT: .LBB32_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2476,74 +2409,66 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB33_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_13 -; RV64ZVE32F-NEXT: .LBB33_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_14 -; RV64ZVE32F-NEXT: .LBB33_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_9 -; RV64ZVE32F-NEXT: .LBB33_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB33_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 -; RV64ZVE32F-NEXT: .LBB33_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB33_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 -; RV64ZVE32F-NEXT: .LBB33_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB33_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_7 -; RV64ZVE32F-NEXT: .LBB33_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_8 -; RV64ZVE32F-NEXT: j .LBB33_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB33_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 +; RV64ZVE32F-NEXT: .LBB33_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB33_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2553,7 +2478,7 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB33_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB33_14 ; RV64ZVE32F-NEXT: .LBB33_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2618,44 +2543,12 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB34_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_13 -; RV64ZVE32F-NEXT: .LBB34_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_14 -; RV64ZVE32F-NEXT: .LBB34_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_9 -; RV64ZVE32F-NEXT: .LBB34_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB34_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB34_16 -; RV64ZVE32F-NEXT: .LBB34_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB34_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2663,23 +2556,26 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB34_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_6 -; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB34_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_7 -; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2687,9 +2583,30 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB34_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_8 -; RV64ZVE32F-NEXT: j .LBB34_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB34_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB34_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB34_16 +; RV64ZVE32F-NEXT: .LBB34_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 48 @@ -2700,7 +2617,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB34_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB34_14 ; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2759,71 +2676,69 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB35_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 -; RV64ZVE32F-NEXT: .LBB35_6: # %else6 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 -; RV64ZVE32F-NEXT: .LBB35_7: # %else8 +; RV64ZVE32F-NEXT: .LBB35_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_9 -; RV64ZVE32F-NEXT: .LBB35_8: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 +; RV64ZVE32F-NEXT: .LBB35_9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB35_9: # %else10 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB35_10: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 -; RV64ZVE32F-NEXT: .LBB35_11: # %else14 +; RV64ZVE32F-NEXT: .LBB35_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB35_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 ; RV64ZVE32F-NEXT: .LBB35_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 ; RV64ZVE32F-NEXT: .LBB35_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 -; RV64ZVE32F-NEXT: j .LBB35_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_9 +; RV64ZVE32F-NEXT: j .LBB35_10 ; RV64ZVE32F-NEXT: .LBB35_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2833,7 +2748,7 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB35_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_12 ; RV64ZVE32F-NEXT: .LBB35_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -3530,7 +3445,7 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -3538,8 +3453,8 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB42_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -3548,7 +3463,7 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB42_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB42_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -3558,66 +3473,61 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB42_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_13 -; RV64ZVE32F-NEXT: .LBB42_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_14 -; RV64ZVE32F-NEXT: .LBB42_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_9 -; RV64ZVE32F-NEXT: .LBB42_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB42_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 -; RV64ZVE32F-NEXT: .LBB42_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB42_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 -; RV64ZVE32F-NEXT: .LBB42_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_8: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_7 -; RV64ZVE32F-NEXT: .LBB42_14: # %cond.store7 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_10: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_8 -; RV64ZVE32F-NEXT: j .LBB42_9 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB42_12: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 +; RV64ZVE32F-NEXT: .LBB42_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB42_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_14 ; RV64ZVE32F-NEXT: .LBB42_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -3780,7 +3690,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -3788,8 +3698,8 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB43_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -3798,7 +3708,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB43_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB43_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -3808,66 +3718,61 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_13 -; RV64ZVE32F-NEXT: .LBB43_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_14 -; RV64ZVE32F-NEXT: .LBB43_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_9 -; RV64ZVE32F-NEXT: .LBB43_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB43_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_16 -; RV64ZVE32F-NEXT: .LBB43_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB43_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 -; RV64ZVE32F-NEXT: .LBB43_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_8: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_7 -; RV64ZVE32F-NEXT: .LBB43_14: # %cond.store7 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_10: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_8 -; RV64ZVE32F-NEXT: j .LBB43_9 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB43_12: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_16 +; RV64ZVE32F-NEXT: .LBB43_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB43_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_14 ; RV64ZVE32F-NEXT: .LBB43_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4062,72 +3967,67 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB44_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: zext.b a0, a0 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_6: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_13 -; RV64ZVE32F-NEXT: .LBB44_6: # %else6 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: zext.b a0, a0 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_14 -; RV64ZVE32F-NEXT: .LBB44_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_9 -; RV64ZVE32F-NEXT: .LBB44_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: zext.b a0, a0 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB44_10: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: zext.b a0, a0 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB44_9: # %else10 +; RV64ZVE32F-NEXT: .LBB44_12: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_16 -; RV64ZVE32F-NEXT: .LBB44_11: # %else14 +; RV64ZVE32F-NEXT: .LBB44_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB44_12: # %cond.store3 +; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: zext.b a0, a0 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 -; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: zext.b a0, a0 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_7 -; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: zext.b a0, a0 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_8 -; RV64ZVE32F-NEXT: j .LBB44_9 -; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: zext.b a0, a0 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_11 -; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 +; RV64ZVE32F-NEXT: sd a3, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_14 +; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: zext.b a0, a0 @@ -4292,7 +4192,7 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -4300,8 +4200,8 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB45_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -4311,7 +4211,7 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB45_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB45_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma @@ -4321,66 +4221,61 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB45_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_13 -; RV64ZVE32F-NEXT: .LBB45_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_14 -; RV64ZVE32F-NEXT: .LBB45_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_9 -; RV64ZVE32F-NEXT: .LBB45_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB45_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_16 -; RV64ZVE32F-NEXT: .LBB45_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB45_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 -; RV64ZVE32F-NEXT: .LBB45_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_8: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_7 -; RV64ZVE32F-NEXT: .LBB45_14: # %cond.store7 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_10: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_8 -; RV64ZVE32F-NEXT: j .LBB45_9 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB45_12: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_16 +; RV64ZVE32F-NEXT: .LBB45_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB45_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_14 ; RV64ZVE32F-NEXT: .LBB45_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4544,7 +4439,7 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -4552,8 +4447,8 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB46_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -4563,7 +4458,7 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB46_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB46_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma @@ -4573,66 +4468,61 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB46_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_13 -; RV64ZVE32F-NEXT: .LBB46_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_14 -; RV64ZVE32F-NEXT: .LBB46_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_9 -; RV64ZVE32F-NEXT: .LBB46_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB46_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_16 -; RV64ZVE32F-NEXT: .LBB46_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB46_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_6 -; RV64ZVE32F-NEXT: .LBB46_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_8: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_7 -; RV64ZVE32F-NEXT: .LBB46_14: # %cond.store7 +; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_10: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_8 -; RV64ZVE32F-NEXT: j .LBB46_9 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB46_12: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_16 +; RV64ZVE32F-NEXT: .LBB46_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB46_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_14 ; RV64ZVE32F-NEXT: .LBB46_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4829,63 +4719,58 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 -; RV64ZVE32F-NEXT: .LBB47_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 -; RV64ZVE32F-NEXT: .LBB47_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_9 -; RV64ZVE32F-NEXT: .LBB47_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 48 -; RV64ZVE32F-NEXT: srli a0, a0, 45 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB47_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 -; RV64ZVE32F-NEXT: .LBB47_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB47_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 48 ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_6: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 -; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 48 ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_8: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_7 -; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 48 ; RV64ZVE32F-NEXT: srli a0, a0, 45 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_10: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_8 -; RV64ZVE32F-NEXT: j .LBB47_9 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: slli a0, a0, 48 +; RV64ZVE32F-NEXT: srli a0, a0, 45 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB47_12: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 +; RV64ZVE32F-NEXT: .LBB47_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 48 @@ -4893,7 +4778,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_11 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_14 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -5057,7 +4942,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; ; RV64ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -5065,8 +4950,8 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB48_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -5076,7 +4961,7 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB48_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB48_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5086,66 +4971,68 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB48_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB48_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_13 -; RV64ZVE32F-NEXT: .LBB48_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_14 -; RV64ZVE32F-NEXT: .LBB48_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_9 -; RV64ZVE32F-NEXT: .LBB48_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: .LBB48_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_10 +; RV64ZVE32F-NEXT: .LBB48_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB48_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB48_10: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB48_16 -; RV64ZVE32F-NEXT: .LBB48_11: # %else14 +; RV64ZVE32F-NEXT: .LBB48_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB48_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_6 ; RV64ZVE32F-NEXT: .LBB48_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_7 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_8 ; RV64ZVE32F-NEXT: .LBB48_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB48_8 -; RV64ZVE32F-NEXT: j .LBB48_9 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB48_9 +; RV64ZVE32F-NEXT: j .LBB48_10 ; RV64ZVE32F-NEXT: .LBB48_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB48_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB48_12 ; RV64ZVE32F-NEXT: .LBB48_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -5307,7 +5194,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -5315,8 +5202,8 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB49_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -5326,7 +5213,7 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB49_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB49_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5336,66 +5223,68 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB49_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB49_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_13 -; RV64ZVE32F-NEXT: .LBB49_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_14 -; RV64ZVE32F-NEXT: .LBB49_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_9 -; RV64ZVE32F-NEXT: .LBB49_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: .LBB49_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_10 +; RV64ZVE32F-NEXT: .LBB49_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB49_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB49_10: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB49_16 -; RV64ZVE32F-NEXT: .LBB49_11: # %else14 +; RV64ZVE32F-NEXT: .LBB49_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB49_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_6 ; RV64ZVE32F-NEXT: .LBB49_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_7 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_8 ; RV64ZVE32F-NEXT: .LBB49_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB49_8 -; RV64ZVE32F-NEXT: j .LBB49_9 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB49_9 +; RV64ZVE32F-NEXT: j .LBB49_10 ; RV64ZVE32F-NEXT: .LBB49_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB49_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB49_12 ; RV64ZVE32F-NEXT: .LBB49_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -5558,7 +5447,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: ld a4, 40(a0) +; RV64ZVE32F-NEXT: ld a5, 40(a0) ; RV64ZVE32F-NEXT: ld a3, 48(a0) ; RV64ZVE32F-NEXT: ld a2, 56(a0) ; RV64ZVE32F-NEXT: ld t1, 8(a0) @@ -5566,8 +5455,8 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: ld a7, 24(a0) ; RV64ZVE32F-NEXT: ld a6, 32(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t2, a5, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v0 +; RV64ZVE32F-NEXT: andi t2, a4, 1 ; RV64ZVE32F-NEXT: beqz t2, .LBB50_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) @@ -5578,7 +5467,7 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: sd a0, 0(t2) ; RV64ZVE32F-NEXT: .LBB50_2: # %else -; RV64ZVE32F-NEXT: andi a0, a5, 2 +; RV64ZVE32F-NEXT: andi a0, a4, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB50_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.store1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma @@ -5589,71 +5478,73 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB50_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: andi a0, a5, 4 +; RV64ZVE32F-NEXT: andi a0, a4, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_13 -; RV64ZVE32F-NEXT: .LBB50_6: # %else6 -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_14 -; RV64ZVE32F-NEXT: .LBB50_7: # %else8 -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_9 -; RV64ZVE32F-NEXT: .LBB50_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v10 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a4, 0(a0) -; RV64ZVE32F-NEXT: .LBB50_9: # %else10 -; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_16 -; RV64ZVE32F-NEXT: .LBB50_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB50_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: .LBB50_6: # %else4 +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_13 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_14 +; RV64ZVE32F-NEXT: .LBB50_8: # %else8 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_10 +; RV64ZVE32F-NEXT: .LBB50_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_6 +; RV64ZVE32F-NEXT: sd a5, 0(a0) +; RV64ZVE32F-NEXT: .LBB50_10: # %else10 +; RV64ZVE32F-NEXT: andi a0, a4, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_15 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_16 +; RV64ZVE32F-NEXT: .LBB50_12: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB50_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_7 +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_8 ; RV64ZVE32F-NEXT: .LBB50_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a0, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB50_8 -; RV64ZVE32F-NEXT: j .LBB50_9 +; RV64ZVE32F-NEXT: andi a0, a4, 32 +; RV64ZVE32F-NEXT: bnez a0, .LBB50_9 +; RV64ZVE32F-NEXT: j .LBB50_10 ; RV64ZVE32F-NEXT: .LBB50_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 32 ; RV64ZVE32F-NEXT: srli a0, a0, 29 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB50_11 +; RV64ZVE32F-NEXT: andi a0, a4, -128 +; RV64ZVE32F-NEXT: beqz a0, .LBB50_12 ; RV64ZVE32F-NEXT: .LBB50_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -6348,44 +6239,12 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB58_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_13 -; RV64ZVE32F-NEXT: .LBB58_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_14 -; RV64ZVE32F-NEXT: .LBB58_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_9 -; RV64ZVE32F-NEXT: .LBB58_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB58_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB58_16 -; RV64ZVE32F-NEXT: .LBB58_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB58_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6393,34 +6252,58 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB58_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 -; RV64ZVE32F-NEXT: .LBB58_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB58_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_7 -; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB58_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_8 -; RV64ZVE32F-NEXT: j .LBB58_9 +; RV64ZVE32F-NEXT: .LBB58_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB58_16 +; RV64ZVE32F-NEXT: .LBB58_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -6431,7 +6314,7 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB58_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB58_14 ; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6499,44 +6382,12 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB59_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_13 -; RV64ZVE32F-NEXT: .LBB59_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_14 -; RV64ZVE32F-NEXT: .LBB59_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_9 -; RV64ZVE32F-NEXT: .LBB59_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB59_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB59_16 -; RV64ZVE32F-NEXT: .LBB59_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB59_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6544,34 +6395,58 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB59_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 -; RV64ZVE32F-NEXT: .LBB59_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB59_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_7 -; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB59_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_8 -; RV64ZVE32F-NEXT: j .LBB59_9 +; RV64ZVE32F-NEXT: .LBB59_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB59_16 +; RV64ZVE32F-NEXT: .LBB59_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -6582,7 +6457,7 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB59_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB59_14 ; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6651,83 +6526,75 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB60_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_13 -; RV64ZVE32F-NEXT: .LBB60_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_14 -; RV64ZVE32F-NEXT: .LBB60_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_9 -; RV64ZVE32F-NEXT: .LBB60_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB60_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 -; RV64ZVE32F-NEXT: .LBB60_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB60_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: .LBB60_6: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 -; RV64ZVE32F-NEXT: .LBB60_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: .LBB60_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_7 -; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: .LBB60_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_8 -; RV64ZVE32F-NEXT: j .LBB60_9 +; RV64ZVE32F-NEXT: .LBB60_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 +; RV64ZVE32F-NEXT: .LBB60_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -6739,7 +6606,7 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB60_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB60_14 ; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6807,44 +6674,12 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: .LBB61_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_13 -; RV64ZVE32F-NEXT: .LBB61_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_14 -; RV64ZVE32F-NEXT: .LBB61_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_9 -; RV64ZVE32F-NEXT: .LBB61_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB61_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB61_16 -; RV64ZVE32F-NEXT: .LBB61_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB61_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6852,33 +6687,57 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB61_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 -; RV64ZVE32F-NEXT: .LBB61_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB61_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_7 -; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB61_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_8 -; RV64ZVE32F-NEXT: j .LBB61_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB61_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB61_16 +; RV64ZVE32F-NEXT: .LBB61_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -6889,7 +6748,7 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id ; RV64ZVE32F-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB61_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB61_14 ; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -7451,71 +7310,64 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB68_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_12 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_13 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_6: # %else6 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_14 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_7: # %else8 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_8: # %cond.store9 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-ZVFH-NEXT: .LBB68_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_15 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB68_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_11: # %else14 -; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB68_12: # %cond.store3 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB68_6: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_13: # %cond.store5 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB68_8: # %else6 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB68_14: # %cond.store7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB68_10: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB68_9 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB68_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB68_12: # %else10 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB68_15 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB68_16 +; RV64ZVE32F-ZVFH-NEXT: .LBB68_14: # %else14 +; RV64ZVE32F-ZVFH-NEXT: ret ; RV64ZVE32F-ZVFH-NEXT: .LBB68_15: # %cond.store11 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 @@ -7524,7 +7376,7 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB68_11 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB68_14 ; RV64ZVE32F-ZVFH-NEXT: .LBB68_16: # %cond.store13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 @@ -7565,44 +7417,12 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_12 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_13 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_6: # %else6 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_14 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_7: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_8: # %cond.store9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_15 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB68_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_11: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_12: # %cond.store3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 @@ -7610,34 +7430,58 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_6: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_13: # %cond.store5 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_8: # %else6 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_14: # %cond.store7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_10: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB68_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB68_9 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_12: # %else10 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB68_15 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB68_16 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_14: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -7648,7 +7492,7 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB68_11 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB68_14 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB68_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 @@ -7712,71 +7556,64 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB69_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_12 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_13 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_6: # %else6 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_14 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_7: # %else8 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_8: # %cond.store9 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-ZVFH-NEXT: .LBB69_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_15 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB69_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_11: # %else14 -; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB69_12: # %cond.store3 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB69_6: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_13: # %cond.store5 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB69_8: # %else6 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB69_14: # %cond.store7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB69_10: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB69_9 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB69_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB69_12: # %else10 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB69_15 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB69_16 +; RV64ZVE32F-ZVFH-NEXT: .LBB69_14: # %else14 +; RV64ZVE32F-ZVFH-NEXT: ret ; RV64ZVE32F-ZVFH-NEXT: .LBB69_15: # %cond.store11 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 @@ -7785,7 +7622,7 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB69_11 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB69_14 ; RV64ZVE32F-ZVFH-NEXT: .LBB69_16: # %cond.store13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 @@ -7826,44 +7663,12 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_12 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_13 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_6: # %else6 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_14 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_7: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_8: # %cond.store9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_15 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB69_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_11: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_12: # %cond.store3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 @@ -7871,34 +7676,58 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_6: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_13: # %cond.store5 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_8: # %else6 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_14: # %cond.store7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_10: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB69_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB69_9 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_12: # %else10 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB69_15 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB69_16 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_14: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -7909,7 +7738,7 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB69_11 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB69_14 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB69_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 @@ -7974,75 +7803,68 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB70_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_12 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_13 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_6: # %else6 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_14 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_7: # %else8 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_8: # %cond.store9 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-ZVFH-NEXT: .LBB70_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_15 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB70_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_11: # %else14 -; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB70_12: # %cond.store3 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB70_6: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_13: # %cond.store5 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB70_8: # %else6 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB70_14: # %cond.store7 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB70_10: # %else8 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB70_9 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB70_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB70_12: # %else10 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB70_15 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB70_16 +; RV64ZVE32F-ZVFH-NEXT: .LBB70_14: # %else14 +; RV64ZVE32F-ZVFH-NEXT: ret ; RV64ZVE32F-ZVFH-NEXT: .LBB70_15: # %cond.store11 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: zext.b a2, a2 @@ -8052,7 +7874,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB70_11 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB70_14 ; RV64ZVE32F-ZVFH-NEXT: .LBB70_16: # %cond.store13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 @@ -8096,83 +7918,75 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_12 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_13 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_6: # %else6 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_14 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_7: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_8: # %cond.store9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_15 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB70_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_11: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_12: # %cond.store3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_6: # %else4 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_13: # %cond.store5 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_8: # %else6 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_14: # %cond.store7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_10: # %else8 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB70_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: zext.b a2, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB70_9 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_12: # %else10 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB70_15 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB70_16 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_14: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -8184,7 +7998,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB70_11 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB70_14 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB70_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 @@ -8248,70 +8062,63 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: .LBB71_4: # %else2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_12 -; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_13 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_6: # %else6 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_14 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_7: # %else8 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_9 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_8: # %cond.store9 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-ZVFH-NEXT: .LBB71_9: # %else10 -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_15 -; RV64ZVE32F-ZVFH-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB71_16 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_11: # %else14 -; RV64ZVE32F-ZVFH-NEXT: ret -; RV64ZVE32F-ZVFH-NEXT: .LBB71_12: # %cond.store3 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_6 +; RV64ZVE32F-ZVFH-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB71_6: # %else4 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_6 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_13: # %cond.store5 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_8 +; RV64ZVE32F-ZVFH-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB71_8: # %else6 ; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_7 -; RV64ZVE32F-ZVFH-NEXT: .LBB71_14: # %cond.store7 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_10 +; RV64ZVE32F-ZVFH-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB71_10: # %else8 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 +; RV64ZVE32F-ZVFH-NEXT: beqz a2, .LBB71_12 +; RV64ZVE32F-ZVFH-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFH-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v8, 4 -; RV64ZVE32F-ZVFH-NEXT: vse16.v v9, (a2) -; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_8 -; RV64ZVE32F-ZVFH-NEXT: j .LBB71_9 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-ZVFH-NEXT: .LBB71_12: # %else10 +; RV64ZVE32F-ZVFH-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFH-NEXT: bnez a2, .LBB71_15 +; RV64ZVE32F-ZVFH-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFH-NEXT: bnez a1, .LBB71_16 +; RV64ZVE32F-ZVFH-NEXT: .LBB71_14: # %else14 +; RV64ZVE32F-ZVFH-NEXT: ret ; RV64ZVE32F-ZVFH-NEXT: .LBB71_15: # %cond.store11 ; RV64ZVE32F-ZVFH-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFH-NEXT: slli a2, a2, 1 @@ -8320,7 +8127,7 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-ZVFH-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-ZVFH-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB71_11 +; RV64ZVE32F-ZVFH-NEXT: beqz a1, .LBB71_14 ; RV64ZVE32F-ZVFH-NEXT: .LBB71_16: # %cond.store13 ; RV64ZVE32F-ZVFH-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFH-NEXT: vslidedown.vi v9, v9, 1 @@ -8361,44 +8168,12 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_4: # %else2 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_12 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_13 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_6: # %else6 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_14 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_7: # %else8 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_9 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_8: # %cond.store9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 5 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 -; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 -; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_9: # %else10 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_15 -; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB71_16 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_11: # %else14 -; RV64ZVE32F-ZVFHMIN-NEXT: ret -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_12: # %cond.store3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_6 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v11, v8, 2 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 @@ -8406,33 +8181,57 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_6: # %else4 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 8 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_6 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_13: # %cond.store5 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_8 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 3 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_8: # %else6 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 16 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_7 -; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_14: # %cond.store7 -; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_10 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 -; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_10: # %else8 ; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 32 -; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_8 -; RV64ZVE32F-ZVFHMIN-NEXT: j .LBB71_9 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a2, .LBB71_12 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-ZVFHMIN-NEXT: slli a2, a2, 1 +; RV64ZVE32F-ZVFHMIN-NEXT: add a2, a0, a2 +; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_12: # %else10 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a2, a1, 64 +; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a2, .LBB71_15 +; RV64ZVE32F-ZVFHMIN-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 +; RV64ZVE32F-ZVFHMIN-NEXT: bnez a1, .LBB71_16 +; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_14: # %else14 +; RV64ZVE32F-ZVFHMIN-NEXT: ret ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_15: # %cond.store11 ; RV64ZVE32F-ZVFHMIN-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma @@ -8443,7 +8242,7 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, ; RV64ZVE32F-ZVFHMIN-NEXT: fmv.h.x fa5, a3 ; RV64ZVE32F-ZVFHMIN-NEXT: fsh fa5, 0(a2) ; RV64ZVE32F-ZVFHMIN-NEXT: andi a1, a1, -128 -; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB71_11 +; RV64ZVE32F-ZVFHMIN-NEXT: beqz a1, .LBB71_14 ; RV64ZVE32F-ZVFHMIN-NEXT: .LBB71_16: # %cond.store13 ; RV64ZVE32F-ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-ZVFHMIN-NEXT: vslidedown.vi v9, v9, 1 @@ -8793,74 +8592,66 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> % ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB78_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_13 -; RV64ZVE32F-NEXT: .LBB78_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_14 -; RV64ZVE32F-NEXT: .LBB78_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_9 -; RV64ZVE32F-NEXT: .LBB78_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB78_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 -; RV64ZVE32F-NEXT: .LBB78_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB78_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB78_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 -; RV64ZVE32F-NEXT: .LBB78_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB78_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_7 -; RV64ZVE32F-NEXT: .LBB78_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB78_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_8 -; RV64ZVE32F-NEXT: j .LBB78_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB78_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 +; RV64ZVE32F-NEXT: .LBB78_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB78_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -8870,7 +8661,7 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> % ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB78_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB78_14 ; RV64ZVE32F-NEXT: .LBB78_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -8930,74 +8721,66 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB79_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_13 -; RV64ZVE32F-NEXT: .LBB79_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_14 -; RV64ZVE32F-NEXT: .LBB79_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB79_9 -; RV64ZVE32F-NEXT: .LBB79_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB79_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB79_16 -; RV64ZVE32F-NEXT: .LBB79_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB79_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB79_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB79_6 -; RV64ZVE32F-NEXT: .LBB79_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB79_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB79_7 -; RV64ZVE32F-NEXT: .LBB79_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB79_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB79_8 -; RV64ZVE32F-NEXT: j .LBB79_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB79_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB79_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB79_16 +; RV64ZVE32F-NEXT: .LBB79_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB79_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -9007,7 +8790,7 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB79_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB79_14 ; RV64ZVE32F-NEXT: .LBB79_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -9071,78 +8854,70 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB80_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_13 -; RV64ZVE32F-NEXT: .LBB80_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_14 -; RV64ZVE32F-NEXT: .LBB80_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_9 -; RV64ZVE32F-NEXT: .LBB80_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB80_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB80_16 -; RV64ZVE32F-NEXT: .LBB80_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB80_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB80_6: # %else4 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_6 -; RV64ZVE32F-NEXT: .LBB80_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB80_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_7 -; RV64ZVE32F-NEXT: .LBB80_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB80_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB80_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB80_8 -; RV64ZVE32F-NEXT: j .LBB80_9 +; RV64ZVE32F-NEXT: .LBB80_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB80_16 +; RV64ZVE32F-NEXT: .LBB80_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB80_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: zext.b a2, a2 @@ -9153,7 +8928,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB80_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB80_14 ; RV64ZVE32F-NEXT: .LBB80_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -9217,74 +8992,66 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB81_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_13 -; RV64ZVE32F-NEXT: .LBB81_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_14 -; RV64ZVE32F-NEXT: .LBB81_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_9 -; RV64ZVE32F-NEXT: .LBB81_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB81_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB81_16 -; RV64ZVE32F-NEXT: .LBB81_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB81_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB81_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 -; RV64ZVE32F-NEXT: .LBB81_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB81_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_7 -; RV64ZVE32F-NEXT: .LBB81_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB81_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_8 -; RV64ZVE32F-NEXT: j .LBB81_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB81_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB81_16 +; RV64ZVE32F-NEXT: .LBB81_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB81_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -9294,7 +9061,7 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB81_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB81_14 ; RV64ZVE32F-NEXT: .LBB81_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -9356,74 +9123,66 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB82_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_13 -; RV64ZVE32F-NEXT: .LBB82_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_14 -; RV64ZVE32F-NEXT: .LBB82_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_9 -; RV64ZVE32F-NEXT: .LBB82_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB82_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB82_16 -; RV64ZVE32F-NEXT: .LBB82_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB82_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB82_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 -; RV64ZVE32F-NEXT: .LBB82_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB82_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_7 -; RV64ZVE32F-NEXT: .LBB82_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB82_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_8 -; RV64ZVE32F-NEXT: j .LBB82_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB82_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB82_16 +; RV64ZVE32F-NEXT: .LBB82_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB82_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -9433,7 +9192,7 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB82_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB82_14 ; RV64ZVE32F-NEXT: .LBB82_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -9498,44 +9257,12 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB83_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_13 -; RV64ZVE32F-NEXT: .LBB83_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_14 -; RV64ZVE32F-NEXT: .LBB83_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_9 -; RV64ZVE32F-NEXT: .LBB83_8: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 46 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB83_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 -; RV64ZVE32F-NEXT: .LBB83_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB83_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9543,23 +9270,26 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB83_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 -; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB83_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_7 -; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 46 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9567,9 +9297,30 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB83_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_8 -; RV64ZVE32F-NEXT: j .LBB83_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 46 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB83_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 +; RV64ZVE32F-NEXT: .LBB83_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 48 @@ -9580,7 +9331,7 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB83_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB83_14 ; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -9639,71 +9390,69 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vse32.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB84_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_13 -; RV64ZVE32F-NEXT: .LBB84_6: # %else6 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 -; RV64ZVE32F-NEXT: .LBB84_7: # %else8 +; RV64ZVE32F-NEXT: .LBB84_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_9 -; RV64ZVE32F-NEXT: .LBB84_8: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_10 +; RV64ZVE32F-NEXT: .LBB84_9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB84_9: # %else10 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB84_10: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v12, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB84_16 -; RV64ZVE32F-NEXT: .LBB84_11: # %else14 +; RV64ZVE32F-NEXT: .LBB84_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB84_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 ; RV64ZVE32F-NEXT: .LBB84_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 +; RV64ZVE32F-NEXT: vse32.v v11, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 ; RV64ZVE32F-NEXT: .LBB84_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vse32.v v10, (a2) +; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_8 -; RV64ZVE32F-NEXT: j .LBB84_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_9 +; RV64ZVE32F-NEXT: j .LBB84_10 ; RV64ZVE32F-NEXT: .LBB84_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -9713,7 +9462,7 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB84_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_12 ; RV64ZVE32F-NEXT: .LBB84_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -10281,66 +10030,61 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB91_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_13 -; RV64ZVE32F-NEXT: .LBB91_6: # %else6 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB91_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_14 -; RV64ZVE32F-NEXT: .LBB91_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_9 -; RV64ZVE32F-NEXT: .LBB91_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB91_9: # %else10 +; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB91_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB91_12: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB91_16 -; RV64ZVE32F-NEXT: .LBB91_11: # %else14 +; RV64ZVE32F-NEXT: .LBB91_14: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB91_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 -; RV64ZVE32F-NEXT: .LBB91_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa3, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_7 -; RV64ZVE32F-NEXT: .LBB91_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa4, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_8 -; RV64ZVE32F-NEXT: j .LBB91_9 ; RV64ZVE32F-NEXT: .LBB91_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB91_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB91_14 ; RV64ZVE32F-NEXT: .LBB91_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10481,66 +10225,61 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_13 -; RV64ZVE32F-NEXT: .LBB92_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_14 -; RV64ZVE32F-NEXT: .LBB92_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_9 -; RV64ZVE32F-NEXT: .LBB92_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB92_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB92_16 -; RV64ZVE32F-NEXT: .LBB92_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB92_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB92_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 -; RV64ZVE32F-NEXT: .LBB92_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB92_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_7 -; RV64ZVE32F-NEXT: .LBB92_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB92_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_8 -; RV64ZVE32F-NEXT: j .LBB92_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB92_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB92_16 +; RV64ZVE32F-NEXT: .LBB92_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB92_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB92_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB92_14 ; RV64ZVE32F-NEXT: .LBB92_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10685,63 +10424,58 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB93_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB93_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB93_13 -; RV64ZVE32F-NEXT: .LBB93_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB93_14 -; RV64ZVE32F-NEXT: .LBB93_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB93_9 -; RV64ZVE32F-NEXT: .LBB93_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: zext.b a2, a2 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB93_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB93_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB93_16 -; RV64ZVE32F-NEXT: .LBB93_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB93_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB93_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB93_6 -; RV64ZVE32F-NEXT: .LBB93_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB93_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB93_7 -; RV64ZVE32F-NEXT: .LBB93_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB93_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB93_8 -; RV64ZVE32F-NEXT: j .LBB93_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB93_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: zext.b a2, a2 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB93_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB93_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB93_16 +; RV64ZVE32F-NEXT: .LBB93_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB93_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: zext.b a2, a2 @@ -10749,7 +10483,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB93_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB93_14 ; RV64ZVE32F-NEXT: .LBB93_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -10894,66 +10628,61 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB94_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB94_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB94_13 -; RV64ZVE32F-NEXT: .LBB94_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB94_14 -; RV64ZVE32F-NEXT: .LBB94_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB94_9 -; RV64ZVE32F-NEXT: .LBB94_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB94_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB94_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB94_16 -; RV64ZVE32F-NEXT: .LBB94_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB94_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB94_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB94_6 -; RV64ZVE32F-NEXT: .LBB94_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB94_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB94_7 -; RV64ZVE32F-NEXT: .LBB94_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB94_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB94_8 -; RV64ZVE32F-NEXT: j .LBB94_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB94_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB94_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB94_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB94_16 +; RV64ZVE32F-NEXT: .LBB94_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB94_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB94_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB94_14 ; RV64ZVE32F-NEXT: .LBB94_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -11096,66 +10825,61 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB95_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB95_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB95_13 -; RV64ZVE32F-NEXT: .LBB95_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB95_14 -; RV64ZVE32F-NEXT: .LBB95_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB95_9 -; RV64ZVE32F-NEXT: .LBB95_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB95_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB95_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB95_16 -; RV64ZVE32F-NEXT: .LBB95_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB95_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB95_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB95_6 -; RV64ZVE32F-NEXT: .LBB95_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB95_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB95_7 -; RV64ZVE32F-NEXT: .LBB95_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB95_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB95_8 -; RV64ZVE32F-NEXT: j .LBB95_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB95_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB95_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB95_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB95_16 +; RV64ZVE32F-NEXT: .LBB95_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB95_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB95_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB95_14 ; RV64ZVE32F-NEXT: .LBB95_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -11302,63 +11026,58 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB96_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_13 -; RV64ZVE32F-NEXT: .LBB96_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_14 -; RV64ZVE32F-NEXT: .LBB96_7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB96_9 -; RV64ZVE32F-NEXT: .LBB96_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 48 -; RV64ZVE32F-NEXT: srli a2, a2, 45 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB96_9: # %else10 -; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 -; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: bnez a1, .LBB96_16 -; RV64ZVE32F-NEXT: .LBB96_11: # %else14 -; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB96_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB96_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB96_6 -; RV64ZVE32F-NEXT: .LBB96_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_8 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) +; RV64ZVE32F-NEXT: .LBB96_8: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB96_7 -; RV64ZVE32F-NEXT: .LBB96_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 ; RV64ZVE32F-NEXT: srli a2, a2, 45 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) +; RV64ZVE32F-NEXT: .LBB96_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB96_8 -; RV64ZVE32F-NEXT: j .LBB96_9 +; RV64ZVE32F-NEXT: beqz a2, .LBB96_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 48 +; RV64ZVE32F-NEXT: srli a2, a2, 45 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa5, 0(a2) +; RV64ZVE32F-NEXT: .LBB96_12: # %else10 +; RV64ZVE32F-NEXT: andi a2, a1, 64 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB96_15 +; RV64ZVE32F-NEXT: # %bb.13: # %else12 +; RV64ZVE32F-NEXT: andi a1, a1, -128 +; RV64ZVE32F-NEXT: bnez a1, .LBB96_16 +; RV64ZVE32F-NEXT: .LBB96_14: # %else14 +; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB96_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 48 @@ -11366,7 +11085,7 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB96_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB96_14 ; RV64ZVE32F-NEXT: .LBB96_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -11509,66 +11228,68 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB97_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_13 -; RV64ZVE32F-NEXT: .LBB97_6: # %else6 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_14 -; RV64ZVE32F-NEXT: .LBB97_7: # %else8 +; RV64ZVE32F-NEXT: .LBB97_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_9 -; RV64ZVE32F-NEXT: .LBB97_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_10 +; RV64ZVE32F-NEXT: .LBB97_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB97_9: # %else10 +; RV64ZVE32F-NEXT: .LBB97_10: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB97_16 -; RV64ZVE32F-NEXT: .LBB97_11: # %else14 +; RV64ZVE32F-NEXT: .LBB97_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB97_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 -; RV64ZVE32F-NEXT: .LBB97_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: .LBB97_13: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 ; RV64ZVE32F-NEXT: .LBB97_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_8 -; RV64ZVE32F-NEXT: j .LBB97_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_9 +; RV64ZVE32F-NEXT: j .LBB97_10 ; RV64ZVE32F-NEXT: .LBB97_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB97_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB97_12 ; RV64ZVE32F-NEXT: .LBB97_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -11709,66 +11430,68 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB98_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_13 -; RV64ZVE32F-NEXT: .LBB98_6: # %else6 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_14 -; RV64ZVE32F-NEXT: .LBB98_7: # %else8 +; RV64ZVE32F-NEXT: .LBB98_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_9 -; RV64ZVE32F-NEXT: .LBB98_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_10 +; RV64ZVE32F-NEXT: .LBB98_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB98_9: # %else10 +; RV64ZVE32F-NEXT: .LBB98_10: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB98_16 -; RV64ZVE32F-NEXT: .LBB98_11: # %else14 +; RV64ZVE32F-NEXT: .LBB98_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB98_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 ; RV64ZVE32F-NEXT: .LBB98_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 ; RV64ZVE32F-NEXT: .LBB98_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_8 -; RV64ZVE32F-NEXT: j .LBB98_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_9 +; RV64ZVE32F-NEXT: j .LBB98_10 ; RV64ZVE32F-NEXT: .LBB98_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB98_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB98_12 ; RV64ZVE32F-NEXT: .LBB98_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -11912,63 +11635,65 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB99_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB99_12 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB99_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 32 +; RV64ZVE32F-NEXT: srli a2, a2, 29 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: .LBB99_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB99_13 -; RV64ZVE32F-NEXT: .LBB99_6: # %else6 +; RV64ZVE32F-NEXT: # %bb.7: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB99_14 -; RV64ZVE32F-NEXT: .LBB99_7: # %else8 +; RV64ZVE32F-NEXT: .LBB99_8: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB99_9 -; RV64ZVE32F-NEXT: .LBB99_8: # %cond.store9 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: beqz a2, .LBB99_10 +; RV64ZVE32F-NEXT: .LBB99_9: # %cond.store9 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB99_9: # %else10 +; RV64ZVE32F-NEXT: .LBB99_10: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB99_15 -; RV64ZVE32F-NEXT: # %bb.10: # %else12 +; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB99_16 -; RV64ZVE32F-NEXT: .LBB99_11: # %else14 +; RV64ZVE32F-NEXT: .LBB99_12: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB99_12: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 -; RV64ZVE32F-NEXT: slli a2, a2, 32 -; RV64ZVE32F-NEXT: srli a2, a2, 29 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB99_6 ; RV64ZVE32F-NEXT: .LBB99_13: # %cond.store5 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB99_7 +; RV64ZVE32F-NEXT: beqz a2, .LBB99_8 ; RV64ZVE32F-NEXT: .LBB99_14: # %cond.store7 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB99_8 -; RV64ZVE32F-NEXT: j .LBB99_9 +; RV64ZVE32F-NEXT: bnez a2, .LBB99_9 +; RV64ZVE32F-NEXT: j .LBB99_10 ; RV64ZVE32F-NEXT: .LBB99_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 @@ -11976,7 +11701,7 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB99_11 +; RV64ZVE32F-NEXT: beqz a1, .LBB99_12 ; RV64ZVE32F-NEXT: .LBB99_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -12242,30 +11967,42 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB101_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_25 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB101_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_26 -; RV64ZVE32F-NEXT: .LBB101_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB101_8 -; RV64ZVE32F-NEXT: .LBB101_7: # %cond.store7 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_10 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB101_8: # %else8 +; RV64ZVE32F-NEXT: .LBB101_10: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_10 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -12273,21 +12010,43 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB101_10: # %else10 +; RV64ZVE32F-NEXT: .LBB101_12: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_27 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB101_14: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_28 -; RV64ZVE32F-NEXT: .LBB101_12: # %else14 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_16: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_29 -; RV64ZVE32F-NEXT: .LBB101_13: # %else16 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.store15 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_18: # %else16 ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_15 -; RV64ZVE32F-NEXT: .LBB101_14: # %cond.store17 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.store17 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12295,46 +12054,66 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 9 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB101_15: # %else18 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 +; RV64ZVE32F-NEXT: .LBB101_20: # %else18 ; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_30 -; RV64ZVE32F-NEXT: # %bb.16: # %else20 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB101_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: .LBB101_22: # %else20 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bltz a2, .LBB101_31 -; RV64ZVE32F-NEXT: .LBB101_17: # %else22 +; RV64ZVE32F-NEXT: bgez a2, .LBB101_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.store21 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_24: # %else22 ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bltz a2, .LBB101_32 -; RV64ZVE32F-NEXT: .LBB101_18: # %else24 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB101_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.store23 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_26: # %else24 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB101_20 -; RV64ZVE32F-NEXT: .LBB101_19: # %cond.store25 +; RV64ZVE32F-NEXT: bgez a2, .LBB101_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.store25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 13 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: .LBB101_20: # %else26 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: .LBB101_28: # %else26 ; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB101_22 -; RV64ZVE32F-NEXT: # %bb.21: # %cond.store27 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB101_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.store27 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 14 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB101_22: # %else28 +; RV64ZVE32F-NEXT: .LBB101_30: # %else28 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB101_24 -; RV64ZVE32F-NEXT: # %bb.23: # %cond.store29 +; RV64ZVE32F-NEXT: beqz a1, .LBB101_32 +; RV64ZVE32F-NEXT: # %bb.31: # %cond.store29 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 @@ -12342,81 +12121,8 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 15 ; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB101_24: # %else30 +; RV64ZVE32F-NEXT: .LBB101_32: # %else30 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB101_25: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_6 -; RV64ZVE32F-NEXT: .LBB101_26: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_7 -; RV64ZVE32F-NEXT: j .LBB101_8 -; RV64ZVE32F-NEXT: .LBB101_27: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 6 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_12 -; RV64ZVE32F-NEXT: .LBB101_28: # %cond.store13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB101_13 -; RV64ZVE32F-NEXT: .LBB101_29: # %cond.store15 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 8 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB101_14 -; RV64ZVE32F-NEXT: j .LBB101_15 -; RV64ZVE32F-NEXT: .LBB101_30: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bgez a2, .LBB101_17 -; RV64ZVE32F-NEXT: .LBB101_31: # %cond.store21 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 11 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bgez a2, .LBB101_18 -; RV64ZVE32F-NEXT: .LBB101_32: # %cond.store23 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 12 -; RV64ZVE32F-NEXT: vse8.v v9, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bltz a2, .LBB101_19 -; RV64ZVE32F-NEXT: j .LBB101_20 %ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %val, <16 x ptr> %ptrs, i32 1, <16 x i1> %m) ret void @@ -12474,52 +12180,86 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB102_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_49 -; RV64ZVE32F-NEXT: # %bb.5: # %else4 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_6 +; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_6: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_50 -; RV64ZVE32F-NEXT: .LBB102_6: # %else6 -; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: beqz a2, .LBB102_8 -; RV64ZVE32F-NEXT: .LBB102_7: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: # %bb.7: # %cond.store5 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_8: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 +; RV64ZVE32F-NEXT: .LBB102_8: # %else6 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB102_10 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_10: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_12 +; RV64ZVE32F-NEXT: # %bb.11: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v14 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB102_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_12: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_51 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_14 +; RV64ZVE32F-NEXT: # %bb.13: # %cond.store11 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 6 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_14: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_52 -; RV64ZVE32F-NEXT: .LBB102_12: # %else14 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_16 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.store13 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 7 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_16: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_53 -; RV64ZVE32F-NEXT: .LBB102_13: # %else16 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_18 +; RV64ZVE32F-NEXT: # %bb.17: # %cond.store15 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 8 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_18: # %else16 ; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_15 -; RV64ZVE32F-NEXT: .LBB102_14: # %cond.store17 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_20 +; RV64ZVE32F-NEXT: # %bb.19: # %cond.store17 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -12527,67 +12267,73 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 9 ; RV64ZVE32F-NEXT: vse8.v v13, (a2) -; RV64ZVE32F-NEXT: .LBB102_15: # %else18 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 +; RV64ZVE32F-NEXT: .LBB102_20: # %else18 ; RV64ZVE32F-NEXT: andi a2, a1, 1024 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: beqz a2, .LBB102_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 10 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB102_17: # %else20 +; RV64ZVE32F-NEXT: .LBB102_22: # %else20 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_19 -; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.store21 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 11 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_19: # %else22 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_24: # %else22 ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_21 -; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_26 +; RV64ZVE32F-NEXT: # %bb.25: # %cond.store23 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 12 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB102_21: # %else24 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_26: # %else24 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_28 +; RV64ZVE32F-NEXT: # %bb.27: # %cond.store25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 13 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB102_23: # %else26 +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_28: # %else26 ; RV64ZVE32F-NEXT: slli a2, a1, 49 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_54 -; RV64ZVE32F-NEXT: # %bb.24: # %else28 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_30 +; RV64ZVE32F-NEXT: # %bb.29: # %cond.store27 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 14 +; RV64ZVE32F-NEXT: vse8.v v13, (a2) +; RV64ZVE32F-NEXT: .LBB102_30: # %else28 ; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_55 -; RV64ZVE32F-NEXT: .LBB102_25: # %else30 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 +; RV64ZVE32F-NEXT: bltz a2, .LBB102_63 +; RV64ZVE32F-NEXT: # %bb.31: # %else30 ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_56 -; RV64ZVE32F-NEXT: .LBB102_26: # %else32 +; RV64ZVE32F-NEXT: bltz a2, .LBB102_64 +; RV64ZVE32F-NEXT: .LBB102_32: # %else32 ; RV64ZVE32F-NEXT: slli a2, a1, 46 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_28 -; RV64ZVE32F-NEXT: .LBB102_27: # %cond.store33 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_34 +; RV64ZVE32F-NEXT: .LBB102_33: # %cond.store33 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -12596,32 +12342,46 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 17 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_28: # %else34 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: .LBB102_34: # %else34 ; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_57 -; RV64ZVE32F-NEXT: # %bb.29: # %else36 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_36 +; RV64ZVE32F-NEXT: # %bb.35: # %cond.store35 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 18 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_36: # %else36 ; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_58 -; RV64ZVE32F-NEXT: .LBB102_30: # %else38 -; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_32 -; RV64ZVE32F-NEXT: .LBB102_31: # %cond.store39 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_38 +; RV64ZVE32F-NEXT: # %bb.37: # %cond.store37 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 19 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_38: # %else38 +; RV64ZVE32F-NEXT: slli a2, a1, 43 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_40 +; RV64ZVE32F-NEXT: # %bb.39: # %cond.store39 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_32: # %else40 +; RV64ZVE32F-NEXT: .LBB102_40: # %else40 ; RV64ZVE32F-NEXT: slli a2, a1, 42 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_34 -; RV64ZVE32F-NEXT: # %bb.33: # %cond.store41 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_42 +; RV64ZVE32F-NEXT: # %bb.41: # %cond.store41 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -12630,21 +12390,46 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 21 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_34: # %else42 +; RV64ZVE32F-NEXT: .LBB102_42: # %else42 ; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_59 -; RV64ZVE32F-NEXT: # %bb.35: # %else44 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_44 +; RV64ZVE32F-NEXT: # %bb.43: # %cond.store43 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 22 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_44: # %else44 ; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_60 -; RV64ZVE32F-NEXT: .LBB102_36: # %else46 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.store45 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 23 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_46: # %else46 ; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_61 -; RV64ZVE32F-NEXT: .LBB102_37: # %else48 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.store47 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 24 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_48: # %else48 ; RV64ZVE32F-NEXT: slli a2, a1, 38 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_39 -; RV64ZVE32F-NEXT: .LBB102_38: # %cond.store49 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_50 +; RV64ZVE32F-NEXT: # %bb.49: # %cond.store49 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -12653,48 +12438,71 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_39: # %else50 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 +; RV64ZVE32F-NEXT: .LBB102_50: # %else50 ; RV64ZVE32F-NEXT: slli a2, a1, 37 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_62 -; RV64ZVE32F-NEXT: # %bb.40: # %else52 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_52 +; RV64ZVE32F-NEXT: # %bb.51: # %cond.store51 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_52: # %else52 ; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_63 -; RV64ZVE32F-NEXT: .LBB102_41: # %else54 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_54 +; RV64ZVE32F-NEXT: # %bb.53: # %cond.store53 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_54: # %else54 ; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_64 -; RV64ZVE32F-NEXT: .LBB102_42: # %else56 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_56 +; RV64ZVE32F-NEXT: # %bb.55: # %cond.store55 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: .LBB102_56: # %else56 ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_44 -; RV64ZVE32F-NEXT: .LBB102_43: # %cond.store57 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_58 +; RV64ZVE32F-NEXT: # %bb.57: # %cond.store57 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 29 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_44: # %else58 +; RV64ZVE32F-NEXT: .LBB102_58: # %else58 ; RV64ZVE32F-NEXT: slli a2, a1, 33 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_46 -; RV64ZVE32F-NEXT: # %bb.45: # %cond.store59 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_60 +; RV64ZVE32F-NEXT: # %bb.59: # %cond.store59 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 30 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB102_46: # %else60 +; RV64ZVE32F-NEXT: .LBB102_60: # %else60 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB102_48 -; RV64ZVE32F-NEXT: # %bb.47: # %cond.store61 +; RV64ZVE32F-NEXT: beqz a1, .LBB102_62 +; RV64ZVE32F-NEXT: # %bb.61: # %cond.store61 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v10 @@ -12703,73 +12511,19 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB102_48: # %else62 +; RV64ZVE32F-NEXT: .LBB102_62: # %else62 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB102_49: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_6 -; RV64ZVE32F-NEXT: .LBB102_50: # %cond.store5 +; RV64ZVE32F-NEXT: .LBB102_63: # %cond.store29 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_7 -; RV64ZVE32F-NEXT: j .LBB102_8 -; RV64ZVE32F-NEXT: .LBB102_51: # %cond.store11 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 6 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_12 -; RV64ZVE32F-NEXT: .LBB102_52: # %cond.store13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 7 -; RV64ZVE32F-NEXT: vse8.v v13, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: beqz a2, .LBB102_13 -; RV64ZVE32F-NEXT: .LBB102_53: # %cond.store15 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 8 -; RV64ZVE32F-NEXT: vse8.v v13, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 512 -; RV64ZVE32F-NEXT: bnez a2, .LBB102_14 -; RV64ZVE32F-NEXT: j .LBB102_15 -; RV64ZVE32F-NEXT: .LBB102_54: # %cond.store27 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 14 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_25 -; RV64ZVE32F-NEXT: .LBB102_55: # %cond.store29 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 15 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_26 -; RV64ZVE32F-NEXT: .LBB102_56: # %cond.store31 +; RV64ZVE32F-NEXT: bgez a2, .LBB102_32 +; RV64ZVE32F-NEXT: .LBB102_64: # %cond.store31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -12777,89 +12531,8 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs, ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 46 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_27 -; RV64ZVE32F-NEXT: j .LBB102_28 -; RV64ZVE32F-NEXT: .LBB102_57: # %cond.store35 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_30 -; RV64ZVE32F-NEXT: .LBB102_58: # %cond.store37 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 19 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_31 -; RV64ZVE32F-NEXT: j .LBB102_32 -; RV64ZVE32F-NEXT: .LBB102_59: # %cond.store43 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 22 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_36 -; RV64ZVE32F-NEXT: .LBB102_60: # %cond.store45 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 23 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_37 -; RV64ZVE32F-NEXT: .LBB102_61: # %cond.store47 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 24 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 38 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_38 -; RV64ZVE32F-NEXT: j .LBB102_39 -; RV64ZVE32F-NEXT: .LBB102_62: # %cond.store51 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_41 -; RV64ZVE32F-NEXT: .LBB102_63: # %cond.store53 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bgez a2, .LBB102_42 -; RV64ZVE32F-NEXT: .LBB102_64: # %cond.store55 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bltz a2, .LBB102_43 -; RV64ZVE32F-NEXT: j .LBB102_44 +; RV64ZVE32F-NEXT: bltz a2, .LBB102_33 +; RV64ZVE32F-NEXT: j .LBB102_34 %ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %val, <32 x ptr> %ptrs, i32 1, <32 x i1> %m) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 47423b33975c8..d3a36525115c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -1516,17 +1516,17 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind { define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v0, v16, v8 +; RV32-NEXT: vwadd.vv v0, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v8, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vwadd.vv v16, v24, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -1535,7 +1535,7 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1547,11 +1547,11 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: li a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: addi a0, sp, 16 @@ -1582,17 +1582,17 @@ define i64 @vwreduce_add_v64i64(ptr %x) { define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_uadd_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle32.v v16, (a0) -; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vslidedown.vi v24, v8, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v0, v16, v8 +; RV32-NEXT: vwaddu.vv v0, v8, v16 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v8, v16, 16 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vwaddu.vv v16, v24, v8 ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma @@ -1601,7 +1601,7 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1613,11 +1613,11 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: sub sp, sp, a1 ; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: li a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vle32.v v16, (a1) +; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: vle32.v v16, (a0) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v24, v8, 16 ; RV64-NEXT: addi a0, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index 0b4231cedcab5..c11319ff335fd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -143,16 +143,15 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave6_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v9, v8, 5, v0.t -; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v10, v9, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v9, v8, 4, v0.t +; CHECK-NEXT: vslidedown.vi v9, v9, 5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v9, v10, 4, v0.t ; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: @@ -189,16 +188,15 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave7_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v9, v8, 6, v0.t -; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v10, v9, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v9, v8, 6, v0.t +; CHECK-NEXT: vslidedown.vi v9, v9, 6, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v9, v10, 6, v0.t ; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index c9ce63b029b2f..ebb920f0ac42e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -91,36 +91,26 @@ define <64 x float> @vfwadd_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwadd.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwadd.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -204,35 +194,24 @@ define <32 x double> @vfwadd_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwadd.vv v8, v16, v0 +; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -396,12 +375,12 @@ define <32 x double> @vfwadd_vf_v32f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v24, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwadd.vf v16, v8, fa0 -; CHECK-NEXT: vfwadd.vf v8, v24, fa0 +; CHECK-NEXT: vfwadd.vf v8, v16, fa0 +; CHECK-NEXT: vfwadd.vf v16, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index cf0df1e77411c..47ac1c1a88df4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -91,36 +91,26 @@ define <64 x float> @vfwmul_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwmul.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwmul.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwmul.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -204,35 +194,24 @@ define <32 x double> @vfwmul_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwmul.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwmul.vv v8, v16, v0 +; CHECK-NEXT: vfwmul.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -396,12 +375,12 @@ define <32 x double> @vfwmul_vf_v32f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v24, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwmul.vf v16, v8, fa0 -; CHECK-NEXT: vfwmul.vf v8, v24, fa0 +; CHECK-NEXT: vfwmul.vf v8, v16, fa0 +; CHECK-NEXT: vfwmul.vf v16, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll index 8a94fa749f48f..25f6b5ab27411 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -91,36 +91,26 @@ define <64 x float> @vfwsub_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwsub.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwsub.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwsub.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -204,35 +194,24 @@ define <32 x double> @vfwsub_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwsub.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwsub.vv v8, v16, v0 +; CHECK-NEXT: vfwsub.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -396,12 +375,12 @@ define <32 x double> @vfwsub_vf_v32f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v24, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwsub.vf v16, v8, fa0 -; CHECK-NEXT: vfwsub.vf v8, v24, fa0 +; CHECK-NEXT: vfwsub.vf v8, v16, fa0 +; CHECK-NEXT: vfwsub.vf v16, v24, fa0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 83a195a66a502..352666de57881 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -2613,31 +2613,30 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x ; RV32-LABEL: vpgather_baseidx_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmv1r.v v7, v0 ; RV32-NEXT: vnsrl.wi v24, v16, 0 ; RV32-NEXT: vnsrl.wi v16, v8, 0 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: addi a3, a1, -16 -; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v0, v0, 2 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vslideup.vi v16, v24, 16 -; RV32-NEXT: vsll.vi v24, v16, 3 -; RV32-NEXT: sltu a2, a1, a3 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v24, 16 -; RV32-NEXT: and a2, a2, a3 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t -; RV32-NEXT: li a2, 16 -; RV32-NEXT: bltu a1, a2, .LBB104_2 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: bltu a1, a3, .LBB104_2 ; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a1, 16 +; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB104_2: -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: addi a2, a1, -16 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v32f64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index f73d41a4d5a11..9997646dce1a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -250,35 +250,25 @@ define <128 x i16> @vwadd_v128i16(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwadd.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwadd.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -295,35 +285,25 @@ define <64 x i32> @vwadd_v64i32(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwadd.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwadd.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -340,34 +320,23 @@ define <32 x i64> @vwadd_v32i64(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwadd.vv v24, v16, v8 +; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 721ea111a51d1..1a716f688dd59 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -250,35 +250,25 @@ define <128 x i16> @vwaddu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwaddu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -295,35 +285,25 @@ define <64 x i32> @vwaddu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwaddu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -340,34 +320,23 @@ define <32 x i64> @vwaddu_v32i64(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v24, v16, v8 +; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwaddu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 835edfba32d8e..94c3138fd330b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -275,36 +275,26 @@ define <128 x i16> @vwmul_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmul.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmul.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -324,36 +314,26 @@ define <64 x i32> @vwmul_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmul.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmul.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -373,35 +353,24 @@ define <32 x i64> @vwmul_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmul.vv v24, v16, v8 +; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index fd6acbf469dac..8ebd93e9dc637 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -267,36 +267,26 @@ define <128 x i16> @vwmulsu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v24, v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmulsu.vv v8, v24, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v8, v0, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulsu.vv v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -316,36 +306,26 @@ define <64 x i32> @vwmulsu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v24, v8, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmulsu.vv v8, v24, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v8, v0, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulsu.vv v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -365,35 +345,24 @@ define <32 x i64> @vwmulsu_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v24, v8, v16 +; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulsu.vv v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v8, v0, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index e2642882be7ca..90e9ffdcb320a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -251,36 +251,26 @@ define <128 x i16> @vwmulu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmulu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -300,36 +290,26 @@ define <64 x i32> @vwmulu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwmulu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -349,35 +329,24 @@ define <32 x i64> @vwmulu_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v24, v16, v8 +; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwmulu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index c884ea483879f..783de24100613 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -250,35 +250,25 @@ define <128 x i16> @vwsub_v128i16(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwsub.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwsub.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -295,35 +285,25 @@ define <64 x i32> @vwsub_v64i32(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwsub.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwsub.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -340,34 +320,23 @@ define <32 x i64> @vwsub_v32i64(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwsub.vv v24, v16, v8 +; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index a6947300e915c..bfdda47cc819e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -250,35 +250,25 @@ define <128 x i16> @vwsubu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v0, (a1) +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v24, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwsubu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -295,35 +285,25 @@ define <64 x i32> @vwsubu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle16.v v0, (a1) +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vslidedown.vx v0, v24, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v24, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vwsubu.vv v8, v16, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v8, v16, v0 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -340,34 +320,23 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v16, 16 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle32.v v0, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 -; CHECK-NEXT: vslidedown.vi v8, v0, 16 +; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v24, v16, v8 +; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vwsubu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v8, v16, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret