diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index e35ffaf2b3935..7cfbe0d760fb6 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -229,6 +229,10 @@ void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // Spilling is generally expensive on all RISC-V cores, so always enable // register-pressure tracking. This will increase compile time. Policy.ShouldTrackPressure = true; + + // Enabling ShouldTrackLaneMasks when vector instructions are supported. + // TODO: Add extensions that need register pairs as well? + Policy.ShouldTrackLaneMasks = hasVInstructions(); } void RISCVSubtarget::overridePostRASchedPolicy( diff --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll index 85867a4ab2c6f..d6d704829a197 100644 --- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll +++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll @@ -24,23 +24,30 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_49) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_49) ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_48) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_48) -; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill +; CHECK-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_46) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_46) -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_45) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_45) -; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs2r.v v10, (a0) # vscale x 16-byte Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: vs4r.v v12, (a0) # vscale x 32-byte Folded Spill +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vs2r.v v16, (a0) # vscale x 16-byte Folded Spill ; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_40) ; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_40) ; CHECK-NEXT: #APP @@ -50,19 +57,18 @@ define void @_Z3foov() { ; CHECK-NEXT: lui a0, 1048572 ; CHECK-NEXT: addi a0, a0, 928 ; CHECK-NEXT: vmsbc.vx v0, v8, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_44) +; CHECK-NEXT: addi a0, a0, %lo(.L__const._Z3foov.var_44) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vle16.v v14, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, mu -; CHECK-NEXT: vsext.vf2 v8, v14, v0.t -; CHECK-NEXT: lui a0, %hi(.L__const._Z3foov.var_44) -; CHECK-NEXT: addi a0, a0, 
%lo(.L__const._Z3foov.var_44) -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vsext.vf2 v8, v16, v0.t ; CHECK-NEXT: lui a0, %hi(var_47) ; CHECK-NEXT: addi a0, a0, %lo(var_47) ; CHECK-NEXT: vsseg4e16.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll index 7a337aa253805..a35cf639f0e26 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll @@ -16,33 +16,33 @@ define <512 x i8> @single_source(<512 x i8> %a) { ; CHECK-NEXT: addi s0, sp, 1536 ; CHECK-NEXT: .cfi_def_cfa s0, 0 ; CHECK-NEXT: andi sp, sp, -512 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a0, 512 ; CHECK-NEXT: addi a1, sp, 512 -; CHECK-NEXT: vmv.x.s a2, v16 -; CHECK-NEXT: vslidedown.vi v24, v16, 5 +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv.x.s a2, v8 +; CHECK-NEXT: vslidedown.vi v24, v8, 5 ; CHECK-NEXT: li a3, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vse8.v v8, (a1) -; CHECK-NEXT: vmv.v.x v8, a2 -; CHECK-NEXT: lbu a0, 770(sp) -; CHECK-NEXT: li a1, 431 -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: lbu a0, 1012(sp) +; CHECK-NEXT: li a0, 431 +; CHECK-NEXT: vmv.v.x v16, a2 +; CHECK-NEXT: lbu a1, 770(sp) +; CHECK-NEXT: vslide1down.vx v16, v16, a1 +; CHECK-NEXT: lbu a1, 1012(sp) ; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a1 +; CHECK-NEXT: vslideup.vx v16, v24, a0 ; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 4 -; CHECK-NEXT: li a1, 466 -; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: li a0, 465 +; CHECK-NEXT: vslidedown.vi v24, v8, 4 +; CHECK-NEXT: li a0, 466 +; CHECK-NEXT: vmv.s.x v8, a1 +; CHECK-NEXT: li a1, 465 ; CHECK-NEXT: li a2, 501 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v24, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vslideup.vx v16, v24, a1 ; CHECK-NEXT: li a0, 500 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v16, v8, a0 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: addi sp, s0, -1536 ; CHECK-NEXT: .cfi_def_cfa sp, 1536 ; CHECK-NEXT: ld ra, 1528(sp) # 8-byte Folded Reload @@ -137,16 +137,16 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) { ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v24, a1 ; CHECK-NEXT: li a1, 501 +; CHECK-NEXT: lui a2, %hi(.LCPI2_0) +; CHECK-NEXT: addi a2, a2, %lo(.LCPI2_0) +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v16, (a2) ; CHECK-NEXT: lui a2, %hi(.LCPI2_1) ; CHECK-NEXT: addi a2, a2, %lo(.LCPI2_1) ; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v0, (a2) ; CHECK-NEXT: li a2, 500 ; CHECK-NEXT: vmv.s.x v24, a3 -; CHECK-NEXT: lui a3, %hi(.LCPI2_0) -; CHECK-NEXT: addi a3, a3, %lo(.LCPI2_0) -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a3) ; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma ; CHECK-NEXT: vslideup.vx v8, v24, a2 ; CHECK-NEXT: addi a1, sp, 1520 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll index f3406b5c470ce..43291fd219be2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll @@ -502,17 +502,17 @@ define <8 x i32> @add_constant_rhs_8xi32_vector_in2(<8 x i32> %vin, i32 %a, i32 ; CHECK-NEXT: addi a1, a1, 25 ; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a3, a3, 2047 -; CHECK-NEXT: addi a3, a3, 308 ; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, a3, 308 ; CHECK-NEXT: vmv.s.x v10, a2 ; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 6 -; CHECK-NEXT: vmv.s.x v10, a3 +; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 7 ; CHECK-NEXT: ret @@ -534,15 +534,15 @@ define <8 x i32> @add_constant_rhs_8xi32_vector_in3(<8 x i32> %vin, i32 %a, i32 ; CHECK-NEXT: addi a1, a1, 25 ; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a3, a3, 2047 -; CHECK-NEXT: addi a3, a3, 308 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma ; CHECK-NEXT: vmv.s.x v8, a0 ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, a3, 308 ; CHECK-NEXT: vmv.s.x v10, a2 ; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vmv.s.x v10, a3 +; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 6 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 248ec1369076b..e881360950495 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1267,19 +1267,16 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) { ; RV32-LABEL: buildvec_v32f64_exact_vlen: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -96 -; RV32-NEXT: .cfi_def_cfa_offset 96 -; RV32-NEXT: fsd fs0, 88(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs1, 80(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs2, 72(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs3, 64(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs4, 56(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs5, 48(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs6, 40(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs7, 32(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs8, 24(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs9, 16(sp) # 8-byte Folded Spill -; RV32-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: .cfi_def_cfa_offset 80 +; RV32-NEXT: fsd fs0, 72(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs1, 64(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs2, 56(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs3, 48(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs4, 40(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs5, 32(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs6, 24(sp) # 8-byte Folded Spill +; RV32-NEXT: fsd fs7, 16(sp) # 
8-byte Folded Spill ; RV32-NEXT: .cfi_offset fs0, -8 ; RV32-NEXT: .cfi_offset fs1, -16 ; RV32-NEXT: .cfi_offset fs2, -24 @@ -1288,85 +1285,79 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV32-NEXT: .cfi_offset fs5, -48 ; RV32-NEXT: .cfi_offset fs6, -56 ; RV32-NEXT: .cfi_offset fs7, -64 -; RV32-NEXT: .cfi_offset fs8, -72 -; RV32-NEXT: .cfi_offset fs9, -80 -; RV32-NEXT: .cfi_offset fs10, -88 -; RV32-NEXT: sw a6, 0(sp) -; RV32-NEXT: sw a7, 4(sp) -; RV32-NEXT: fld ft0, 248(sp) -; RV32-NEXT: fld ft1, 240(sp) -; RV32-NEXT: fld ft2, 232(sp) -; RV32-NEXT: fld ft3, 224(sp) -; RV32-NEXT: fld ft6, 216(sp) -; RV32-NEXT: fld ft8, 208(sp) -; RV32-NEXT: fld ft10, 200(sp) -; RV32-NEXT: fld fs1, 192(sp) -; RV32-NEXT: fld ft11, 184(sp) -; RV32-NEXT: fld fs4, 176(sp) -; RV32-NEXT: fld fs2, 168(sp) -; RV32-NEXT: fld fs5, 160(sp) -; RV32-NEXT: fld fs3, 136(sp) -; RV32-NEXT: fld fs6, 128(sp) -; RV32-NEXT: fld fs7, 152(sp) -; RV32-NEXT: fld fs8, 144(sp) -; RV32-NEXT: fld ft4, 120(sp) -; RV32-NEXT: fld ft5, 112(sp) -; RV32-NEXT: fld ft7, 104(sp) -; RV32-NEXT: fld ft9, 96(sp) +; RV32-NEXT: sw a6, 8(sp) +; RV32-NEXT: sw a7, 12(sp) +; RV32-NEXT: fld ft0, 232(sp) +; RV32-NEXT: fld ft4, 224(sp) +; RV32-NEXT: fld ft1, 216(sp) +; RV32-NEXT: fld ft7, 208(sp) +; RV32-NEXT: fld ft2, 200(sp) +; RV32-NEXT: fld ft10, 192(sp) +; RV32-NEXT: fld ft3, 184(sp) +; RV32-NEXT: fld fs1, 176(sp) +; RV32-NEXT: fld ft5, 168(sp) +; RV32-NEXT: fld fs2, 160(sp) +; RV32-NEXT: fld ft6, 152(sp) +; RV32-NEXT: fld fs3, 144(sp) +; RV32-NEXT: fld ft8, 120(sp) +; RV32-NEXT: fld fs4, 112(sp) +; RV32-NEXT: fld ft9, 136(sp) +; RV32-NEXT: fld fs5, 128(sp) +; RV32-NEXT: fld ft11, 104(sp) +; RV32-NEXT: fld fs6, 96(sp) +; RV32-NEXT: fld fs0, 88(sp) +; RV32-NEXT: fld fs7, 80(sp) ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vfmv.v.f v8, fa2 -; RV32-NEXT: fld fa2, 0(sp) -; RV32-NEXT: sw a4, 0(sp) -; RV32-NEXT: sw a5, 4(sp) -; RV32-NEXT: fld fs0, 0(sp) -; RV32-NEXT: sw a2, 0(sp) -; RV32-NEXT: sw a3, 4(sp) -; RV32-NEXT: fld fs9, 0(sp) -; RV32-NEXT: sw a0, 0(sp) -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: fld fs10, 0(sp) -; RV32-NEXT: vfmv.v.f v9, fs8 -; RV32-NEXT: vfmv.v.f v10, fs6 -; RV32-NEXT: vfmv.v.f v11, fs5 -; RV32-NEXT: vfmv.v.f v12, fs4 -; RV32-NEXT: vfmv.v.f v13, fs1 -; RV32-NEXT: vfslide1down.vf v17, v9, fs7 -; RV32-NEXT: vfslide1down.vf v16, v10, fs3 -; RV32-NEXT: vfslide1down.vf v18, v11, fs2 -; RV32-NEXT: vfmv.v.f v9, fs10 -; RV32-NEXT: vfslide1down.vf v19, v12, ft11 -; RV32-NEXT: vfslide1down.vf v20, v13, ft10 -; RV32-NEXT: vfslide1down.vf v12, v9, fs9 +; RV32-NEXT: vfmv.v.f v10, fa0 +; RV32-NEXT: vfmv.v.f v11, fa4 +; RV32-NEXT: vfmv.v.f v12, fa6 +; RV32-NEXT: fld fa4, 8(sp) +; RV32-NEXT: sw a4, 8(sp) +; RV32-NEXT: sw a5, 12(sp) ; RV32-NEXT: vfslide1down.vf v9, v8, fa3 -; RV32-NEXT: vfmv.v.f v8, ft8 -; RV32-NEXT: vfslide1down.vf v21, v8, ft6 -; RV32-NEXT: vfmv.v.f v8, fa0 -; RV32-NEXT: vfslide1down.vf v8, v8, fa1 -; RV32-NEXT: vfmv.v.f v10, ft3 -; RV32-NEXT: vfslide1down.vf v22, v10, ft2 -; RV32-NEXT: vfmv.v.f v10, fa4 -; RV32-NEXT: vfslide1down.vf v10, v10, fa5 -; RV32-NEXT: vfmv.v.f v11, fa6 -; RV32-NEXT: vfslide1down.vf v11, v11, fa7 -; RV32-NEXT: vfmv.v.f v13, fs0 -; RV32-NEXT: vfslide1down.vf v13, v13, fa2 -; RV32-NEXT: vfmv.v.f v14, ft9 -; RV32-NEXT: vfslide1down.vf v14, v14, ft7 -; RV32-NEXT: vfmv.v.f v15, ft5 -; RV32-NEXT: vfslide1down.vf v15, v15, ft4 -; RV32-NEXT: vfmv.v.f v23, ft1 -; RV32-NEXT: vfslide1down.vf v23, v23, ft0 -; RV32-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload -; 
RV32-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs3, 64(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs4, 56(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs5, 48(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs6, 40(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs7, 32(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs8, 24(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs9, 16(sp) # 8-byte Folded Reload -; RV32-NEXT: fld fs10, 8(sp) # 8-byte Folded Reload +; RV32-NEXT: vfslide1down.vf v8, v10, fa1 +; RV32-NEXT: vfslide1down.vf v10, v11, fa5 +; RV32-NEXT: vfslide1down.vf v11, v12, fa7 +; RV32-NEXT: fld fa5, 8(sp) +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a3, 12(sp) +; RV32-NEXT: fld fa3, 8(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: fld fa2, 8(sp) +; RV32-NEXT: vfmv.v.f v12, fs7 +; RV32-NEXT: vfmv.v.f v13, fs6 +; RV32-NEXT: vfmv.v.f v16, fs5 +; RV32-NEXT: vfmv.v.f v18, fs4 +; RV32-NEXT: vfmv.v.f v19, fs3 +; RV32-NEXT: vfmv.v.f v20, fs2 +; RV32-NEXT: vfmv.v.f v21, fs1 +; RV32-NEXT: vfmv.v.f v22, ft10 +; RV32-NEXT: vfmv.v.f v23, ft7 +; RV32-NEXT: vfmv.v.f v24, ft4 +; RV32-NEXT: vfslide1down.vf v14, v12, fs0 +; RV32-NEXT: vfslide1down.vf v15, v13, ft11 +; RV32-NEXT: vfslide1down.vf v17, v16, ft9 +; RV32-NEXT: vfslide1down.vf v16, v18, ft8 +; RV32-NEXT: vfslide1down.vf v18, v19, ft6 +; RV32-NEXT: vfslide1down.vf v19, v20, ft5 +; RV32-NEXT: vfslide1down.vf v20, v21, ft3 +; RV32-NEXT: vfslide1down.vf v21, v22, ft2 +; RV32-NEXT: vfslide1down.vf v22, v23, ft1 +; RV32-NEXT: vfmv.v.f v12, fa5 +; RV32-NEXT: vfslide1down.vf v13, v12, fa4 +; RV32-NEXT: vfmv.v.f v12, fa2 +; RV32-NEXT: vfslide1down.vf v12, v12, fa3 +; RV32-NEXT: vfslide1down.vf v23, v24, ft0 +; RV32-NEXT: fld fs0, 72(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs1, 64(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs2, 56(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs3, 48(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs4, 40(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs5, 32(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs6, 24(sp) # 8-byte Folded Reload +; RV32-NEXT: fld fs7, 16(sp) # 8-byte Folded Reload ; RV32-NEXT: .cfi_restore fs0 ; RV32-NEXT: .cfi_restore fs1 ; RV32-NEXT: .cfi_restore fs2 @@ -1375,10 +1366,7 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV32-NEXT: .cfi_restore fs5 ; RV32-NEXT: .cfi_restore fs6 ; RV32-NEXT: .cfi_restore fs7 -; RV32-NEXT: .cfi_restore fs8 -; RV32-NEXT: .cfi_restore fs9 -; RV32-NEXT: .cfi_restore fs10 -; RV32-NEXT: addi sp, sp, 96 +; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; @@ -1402,25 +1390,25 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: .cfi_offset fs5, -48 ; RV64-NEXT: .cfi_offset fs6, -56 ; RV64-NEXT: .cfi_offset fs7, -64 -; RV64-NEXT: fmv.d.x ft6, a7 -; RV64-NEXT: fmv.d.x ft9, a5 -; RV64-NEXT: fmv.d.x ft10, a3 -; RV64-NEXT: fmv.d.x ft11, a1 +; RV64-NEXT: fmv.d.x ft11, a7 +; RV64-NEXT: fmv.d.x fs0, a5 +; RV64-NEXT: fmv.d.x fs1, a3 +; RV64-NEXT: fmv.d.x fs2, a1 ; RV64-NEXT: fld ft0, 184(sp) -; RV64-NEXT: fld ft1, 176(sp) -; RV64-NEXT: fld ft2, 168(sp) -; RV64-NEXT: fld ft3, 160(sp) -; RV64-NEXT: fld ft4, 152(sp) -; RV64-NEXT: fld ft5, 144(sp) -; RV64-NEXT: fld ft7, 136(sp) -; RV64-NEXT: fld ft8, 128(sp) -; RV64-NEXT: fld fs0, 120(sp) -; RV64-NEXT: fld fs1, 112(sp) -; RV64-NEXT: fld fs2, 104(sp) -; RV64-NEXT: fld fs3, 96(sp) -; RV64-NEXT: fld fs4, 72(sp) -; RV64-NEXT: fld 
fs5, 64(sp) -; RV64-NEXT: fld fs6, 88(sp) +; RV64-NEXT: fld ft3, 176(sp) +; RV64-NEXT: fld ft1, 168(sp) +; RV64-NEXT: fld ft6, 160(sp) +; RV64-NEXT: fld ft2, 152(sp) +; RV64-NEXT: fld ft9, 144(sp) +; RV64-NEXT: fld ft4, 136(sp) +; RV64-NEXT: fld fs3, 128(sp) +; RV64-NEXT: fld ft5, 120(sp) +; RV64-NEXT: fld fs4, 112(sp) +; RV64-NEXT: fld ft7, 104(sp) +; RV64-NEXT: fld fs5, 96(sp) +; RV64-NEXT: fld ft8, 72(sp) +; RV64-NEXT: fld fs6, 64(sp) +; RV64-NEXT: fld ft10, 88(sp) ; RV64-NEXT: fld fs7, 80(sp) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vfmv.v.f v8, fa2 @@ -1429,32 +1417,32 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: vfmv.v.f v12, fa6 ; RV64-NEXT: vmv.v.x v13, a0 ; RV64-NEXT: vmv.v.x v14, a2 +; RV64-NEXT: vmv.v.x v15, a4 +; RV64-NEXT: vmv.v.x v16, a6 ; RV64-NEXT: vfslide1down.vf v9, v8, fa3 ; RV64-NEXT: vfslide1down.vf v8, v10, fa1 ; RV64-NEXT: vfslide1down.vf v10, v11, fa5 ; RV64-NEXT: vfslide1down.vf v11, v12, fa7 -; RV64-NEXT: vfmv.v.f v15, fs7 -; RV64-NEXT: vfmv.v.f v16, fs5 -; RV64-NEXT: vfslide1down.vf v12, v13, ft11 -; RV64-NEXT: vfslide1down.vf v13, v14, ft10 -; RV64-NEXT: vfslide1down.vf v17, v15, fs6 -; RV64-NEXT: vfslide1down.vf v16, v16, fs4 -; RV64-NEXT: vmv.v.x v14, a4 -; RV64-NEXT: vfslide1down.vf v14, v14, ft9 -; RV64-NEXT: vfmv.v.f v15, fs3 -; RV64-NEXT: vfslide1down.vf v18, v15, fs2 -; RV64-NEXT: vmv.v.x v15, a6 -; RV64-NEXT: vfslide1down.vf v15, v15, ft6 -; RV64-NEXT: vfmv.v.f v19, fs1 -; RV64-NEXT: vfslide1down.vf v19, v19, fs0 -; RV64-NEXT: vfmv.v.f v20, ft8 -; RV64-NEXT: vfslide1down.vf v20, v20, ft7 -; RV64-NEXT: vfmv.v.f v21, ft5 -; RV64-NEXT: vfslide1down.vf v21, v21, ft4 -; RV64-NEXT: vfmv.v.f v22, ft3 -; RV64-NEXT: vfslide1down.vf v22, v22, ft2 -; RV64-NEXT: vfmv.v.f v23, ft1 -; RV64-NEXT: vfslide1down.vf v23, v23, ft0 +; RV64-NEXT: vfslide1down.vf v12, v13, fs2 +; RV64-NEXT: vfslide1down.vf v13, v14, fs1 +; RV64-NEXT: vfslide1down.vf v14, v15, fs0 +; RV64-NEXT: vfslide1down.vf v15, v16, ft11 +; RV64-NEXT: vfmv.v.f v16, fs7 +; RV64-NEXT: vfmv.v.f v18, fs6 +; RV64-NEXT: vfmv.v.f v19, fs5 +; RV64-NEXT: vfmv.v.f v20, fs4 +; RV64-NEXT: vfmv.v.f v21, fs3 +; RV64-NEXT: vfmv.v.f v22, ft9 +; RV64-NEXT: vfmv.v.f v23, ft6 +; RV64-NEXT: vfmv.v.f v24, ft3 +; RV64-NEXT: vfslide1down.vf v17, v16, ft10 +; RV64-NEXT: vfslide1down.vf v16, v18, ft8 +; RV64-NEXT: vfslide1down.vf v18, v19, ft7 +; RV64-NEXT: vfslide1down.vf v19, v20, ft5 +; RV64-NEXT: vfslide1down.vf v20, v21, ft4 +; RV64-NEXT: vfslide1down.vf v21, v22, ft2 +; RV64-NEXT: vfslide1down.vf v22, v23, ft1 +; RV64-NEXT: vfslide1down.vf v23, v24, ft0 ; RV64-NEXT: fld fs0, 56(sp) # 8-byte Folded Reload ; RV64-NEXT: fld fs1, 48(sp) # 8-byte Folded Reload ; RV64-NEXT: fld fs2, 40(sp) # 8-byte Folded Reload @@ -1911,14 +1899,14 @@ define <8 x float> @buildvec_vfredusum_slideup_not_extract_first(float %start, < ; CHECK-LABEL: buildvec_vfredusum_slideup_not_extract_first: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfredusum.vs v9, v12, v10 -; CHECK-NEXT: vfredusum.vs v10, v14, v10 -; CHECK-NEXT: vfmv.f.s fa5, v9 -; CHECK-NEXT: vfmv.f.s fa4, v10 -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vfslide1down.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v10, v8, v16 +; CHECK-NEXT: vfredusum.vs v8, v12, v16 +; CHECK-NEXT: vfredusum.vs v9, v14, v16 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 +; 
CHECK-NEXT: vrgather.vi v8, v10, 0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa0 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 ; CHECK-NEXT: vslidedown.vi v8, v8, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll index 13891cb84e0f2..35fe2f4ee0976 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fptrunc-vp.ll @@ -112,11 +112,11 @@ define <32 x float> @vfptrunc_v32f32_v32f64(<32 x double> %a, <32 x i1> %m, i32 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vfncvt.f.f.w v24, v16, v0.t -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vslideup.vi v8, v24, 16 ; CHECK-NEXT: ret %v = call <32 x float> @llvm.vp.fptrunc.v32f64.v32f32(<32 x double> %a, <32 x i1> %m, i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index 00328f9d33d3e..f503691001bc2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -285,14 +285,14 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_2: ; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vle32.v v10, (a1) +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) ; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; VLA-NEXT: vslideup.vi v8, v10, 2 +; VLA-NEXT: vslideup.vi v10, v8, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: vse32.v v10, (a0) ; VLA-NEXT: ret ; ; VLS-LABEL: insert_v8i32_v2i32_2: @@ -314,13 +314,12 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { ; VLA-LABEL: insert_v8i32_v2i32_6: ; VLA: # %bb.0: -; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vle32.v v8, (a0) ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLA-NEXT: vle32.v v10, (a1) +; VLA-NEXT: vle32.v v8, (a1) ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; VLA-NEXT: vslideup.vi v8, v10, 6 -; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) ; VLA-NEXT: ret ; ; VLS-LABEL: insert_v8i32_v2i32_6: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 4bec67d91847d..090bd883f1a34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1104,20 +1104,20 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2) ; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v10, v9, a1 +; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v10, v9, a2 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v10, a2 -; RV32-NEXT: vslide1down.vx v8, v8, 
a3 +; RV32-NEXT: vslide1down.vx v8, v10, a3 ; RV32-NEXT: ret ; ; RV64V-LABEL: v4xi64_exact: ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 +; RV64V-NEXT: vmv.v.x v10, a0 ; RV64V-NEXT: vslide1down.vx v9, v8, a3 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: vslide1down.vx v8, v10, a1 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: v4xi64_exact: @@ -1159,12 +1159,12 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i ; RV32-NEXT: vslide1down.vx v9, v8, a7 ; RV32-NEXT: vslide1down.vx v8, v10, a3 ; RV32-NEXT: vmv.v.x v10, s0 -; RV32-NEXT: vslide1down.vx v10, v10, t6 -; RV32-NEXT: vslide1down.vx v10, v10, t5 -; RV32-NEXT: vslide1down.vx v10, v10, t4 ; RV32-NEXT: vmv.v.x v11, t3 +; RV32-NEXT: vslide1down.vx v10, v10, t6 ; RV32-NEXT: vslide1down.vx v11, v11, t2 +; RV32-NEXT: vslide1down.vx v10, v10, t5 ; RV32-NEXT: vslide1down.vx v11, v11, t1 +; RV32-NEXT: vslide1down.vx v10, v10, t4 ; RV32-NEXT: vslide1down.vx v11, v11, t0 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore s0 @@ -1177,12 +1177,12 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 ; RV64V-NEXT: vmv.v.x v10, a0 +; RV64V-NEXT: vmv.v.x v11, a4 +; RV64V-NEXT: vmv.v.x v12, a6 ; RV64V-NEXT: vslide1down.vx v9, v8, a3 ; RV64V-NEXT: vslide1down.vx v8, v10, a1 -; RV64V-NEXT: vmv.v.x v10, a4 -; RV64V-NEXT: vslide1down.vx v10, v10, a5 -; RV64V-NEXT: vmv.v.x v11, a6 -; RV64V-NEXT: vslide1down.vx v11, v11, a7 +; RV64V-NEXT: vslide1down.vx v10, v11, a5 +; RV64V-NEXT: vslide1down.vx v11, v12, a7 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: v8xi64_exact: @@ -1215,11 +1215,11 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v10, v9, a1 +; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v10, v9, a2 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v10, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v10, a3 ; RV32-NEXT: vmv.v.v v10, v8 ; RV32-NEXT: vmv.v.v v11, v9 ; RV32-NEXT: ret @@ -1228,9 +1228,9 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 +; RV64V-NEXT: vmv.v.x v10, a0 ; RV64V-NEXT: vslide1down.vx v9, v8, a3 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: vslide1down.vx v8, v10, a1 ; RV64V-NEXT: vmv.v.v v10, v8 ; RV64V-NEXT: vmv.v.v v11, v9 ; RV64V-NEXT: ret @@ -1264,20 +1264,20 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32-NEXT: vmv.v.x v8, a4 ; RV32-NEXT: vmv.v.x v9, a0 ; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v10, v9, a1 +; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v10, v9, a2 ; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v10, a2 -; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v8, v10, a3 ; RV32-NEXT: ret ; ; RV64V-LABEL: v8xi64_exact_undef_suffix: ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 +; RV64V-NEXT: vmv.v.x v10, a0 ; RV64V-NEXT: 
vslide1down.vx v9, v8, a3 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: vslide1down.vx v8, v10, a1 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: v8xi64_exact_undef_suffix: @@ -1303,18 +1303,18 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32-NEXT: vslide1down.vx v8, v8, a5 ; RV32-NEXT: vslide1down.vx v9, v9, a1 ; RV32-NEXT: vslide1down.vx v8, v8, a6 +; RV32-NEXT: vslide1down.vx v9, v9, a2 ; RV32-NEXT: vslide1down.vx v11, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v9, a2 -; RV32-NEXT: vslide1down.vx v10, v8, a3 +; RV32-NEXT: vslide1down.vx v10, v9, a3 ; RV32-NEXT: ret ; ; RV64V-LABEL: v8xi64_exact_undef_prefix: ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64V-NEXT: vmv.v.x v8, a2 +; RV64V-NEXT: vmv.v.x v9, a0 ; RV64V-NEXT: vslide1down.vx v11, v8, a3 -; RV64V-NEXT: vmv.v.x v8, a0 -; RV64V-NEXT: vslide1down.vx v10, v8, a1 +; RV64V-NEXT: vslide1down.vx v10, v9, a1 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: v8xi64_exact_undef_prefix: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 1d691b130b3da..8e8b510f720a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1892,9 +1892,9 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) { ; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v12, v8, 4 ; CHECK-NEXT: vmv1r.v v13, v8 -; CHECK-NEXT: vmv1r.v v14, v12 ; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma ; CHECK-NEXT: vslidedown.vi v16, v8, 8 +; CHECK-NEXT: vmv1r.v v14, v12 ; CHECK-NEXT: vmv1r.v v15, v16 ; CHECK-NEXT: vslidedown.vi v16, v8, 12 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll index c05f306424519..278055f17de37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll @@ -257,11 +257,11 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) { ; CHECK-NEXT: vsext.vf2 v10, v8 ; CHECK-NEXT: vslidedown.vx v14, v10, a1 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v8, v12, v10 ; CHECK-NEXT: vrgatherei16.vv v9, v12, v14 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v14, v14, a1 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v8, v12, v10 ; CHECK-NEXT: vrgatherei16.vv v10, v12, v14 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v14, v14, a1 @@ -302,27 +302,27 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) { ; CHECK-NEXT: vsext.vf2 v12, v8 ; CHECK-NEXT: vslidedown.vx v20, v12, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v9, v16, v20 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v20, v20, a1 -; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v8, v16, v12 +; CHECK-NEXT: vrgatherei16.vv v9, v16, v20 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v20, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v16, v20 +; CHECK-NEXT: vrgatherei16.vv v10, v16, v12 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v20, v12, a1 +; 
CHECK-NEXT: vslidedown.vx v12, v12, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v16, v12 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v24, v20, a1 +; CHECK-NEXT: vslidedown.vx v20, v12, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v12, v16, v20 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v20, v24, a1 +; CHECK-NEXT: vslidedown.vx v20, v20, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v13, v16, v20 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v20, v20, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v16, v24 ; CHECK-NEXT: vrgatherei16.vv v14, v16, v20 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v20, v20, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index f2353e7d028bd..a3445959f687d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -24,7 +24,8 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x ; VLA-LABEL: concat_4xv2i32: ; VLA: # %bb.0: ; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vmv1r.v v12, v11 +; VLA-NEXT: vslideup.vi v10, v12, 2 ; VLA-NEXT: vslideup.vi v8, v9, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; VLA-NEXT: vslideup.vi v8, v10, 4 @@ -49,8 +50,9 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x ; VLA-LABEL: concat_8xv1i32: ; VLA: # %bb.0: ; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vmv1r.v v16, v13 ; VLA-NEXT: vslideup.vi v14, v15, 1 -; VLA-NEXT: vslideup.vi v12, v13, 1 +; VLA-NEXT: vslideup.vi v12, v16, 1 ; VLA-NEXT: vslideup.vi v10, v11, 1 ; VLA-NEXT: vslideup.vi v8, v9, 1 ; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -67,12 +69,10 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x ; VLS-NEXT: vmv1r.v v16, v8 ; VLS-NEXT: vslideup.vi v14, v15, 1 ; VLS-NEXT: vslideup.vi v17, v13, 1 -; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; VLS-NEXT: vslideup.vi v17, v14, 2 -; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; VLS-NEXT: vslideup.vi v10, v11, 1 ; VLS-NEXT: vslideup.vi v16, v9, 1 ; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v17, v14, 2 ; VLS-NEXT: vslideup.vi v16, v10, 2 ; VLS-NEXT: vmv2r.v v8, v16 ; VLS-NEXT: ret @@ -127,9 +127,11 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x ; VLA-LABEL: concat_8xv2i32: ; VLA: # %bb.0: ; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; VLA-NEXT: vslideup.vi v14, v15, 2 +; VLA-NEXT: vmv1r.v v16, v15 +; VLA-NEXT: vmv1r.v v17, v11 +; VLA-NEXT: vslideup.vi v14, v16, 2 ; VLA-NEXT: vslideup.vi v12, v13, 2 -; VLA-NEXT: vslideup.vi v10, v11, 2 +; VLA-NEXT: vslideup.vi v10, v17, 2 ; VLA-NEXT: vslideup.vi v8, v9, 2 ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; VLA-NEXT: vslideup.vi v12, v14, 4 @@ -207,15 +209,15 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x ; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; VLA-NEXT: vmv1r.v v18, v15 ; VLA-NEXT: vmv1r.v v20, v14 -; VLA-NEXT: vmv1r.v v14, v13 +; VLA-NEXT: vmv1r.v v24, v13 ; VLA-NEXT: vmv1r.v v16, v12 -; VLA-NEXT: vmv1r.v v22, v11 +; VLA-NEXT: vmv1r.v v26, v11 ; VLA-NEXT: 
vmv1r.v v12, v10 ; VLA-NEXT: vmv1r.v v10, v9 ; VLA-NEXT: li a0, 32 ; VLA-NEXT: vslideup.vi v20, v18, 4 -; VLA-NEXT: vslideup.vi v16, v14, 4 -; VLA-NEXT: vslideup.vi v12, v22, 4 +; VLA-NEXT: vslideup.vi v16, v24, 4 +; VLA-NEXT: vslideup.vi v12, v26, 4 ; VLA-NEXT: vslideup.vi v8, v10, 4 ; VLA-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; VLA-NEXT: vslideup.vi v16, v20, 8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll index f80b7bd94490e..6cecf8d1218aa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll @@ -1616,34 +1616,34 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal ; V-NEXT: vmv.v.i v16, 8 ; V-NEXT: vmv.v.i v17, 2 ; V-NEXT: vmv.v.i v18, 12 +; V-NEXT: vslideup.vi v20, v13, 2 +; V-NEXT: vslideup.vi v21, v11, 2 +; V-NEXT: vslideup.vi v22, v9, 2 +; V-NEXT: li a0, -256 ; V-NEXT: vmv.v.v v0, v16 ; V-NEXT: vslideup.vi v19, v15, 1, v0.t ; V-NEXT: vmv.v.v v0, v17 ; V-NEXT: vslidedown.vi v14, v14, 1, v0.t ; V-NEXT: vmv.v.v v0, v18 ; V-NEXT: vmerge.vvm v15, v14, v19, v0 -; V-NEXT: vslideup.vi v14, v13, 2 ; V-NEXT: vmv.v.v v0, v16 -; V-NEXT: vslideup.vi v14, v13, 1, v0.t +; V-NEXT: vslideup.vi v20, v13, 1, v0.t ; V-NEXT: vmv.v.v v0, v17 ; V-NEXT: vslidedown.vi v12, v12, 1, v0.t ; V-NEXT: vmv.v.v v0, v18 -; V-NEXT: vmerge.vvm v14, v12, v14, v0 -; V-NEXT: vslideup.vi v12, v11, 2 -; V-NEXT: li a0, -256 +; V-NEXT: vmerge.vvm v14, v12, v20, v0 ; V-NEXT: vmv.v.v v0, v16 -; V-NEXT: vslideup.vi v12, v11, 1, v0.t +; V-NEXT: vslideup.vi v21, v11, 1, v0.t ; V-NEXT: vmv.v.v v0, v17 ; V-NEXT: vslidedown.vi v10, v10, 1, v0.t ; V-NEXT: vmv.v.v v0, v18 -; V-NEXT: vmerge.vvm v13, v10, v12, v0 -; V-NEXT: vslideup.vi v10, v9, 2 +; V-NEXT: vmerge.vvm v13, v10, v21, v0 ; V-NEXT: vmv.v.v v0, v16 -; V-NEXT: vslideup.vi v10, v9, 1, v0.t +; V-NEXT: vslideup.vi v22, v9, 1, v0.t ; V-NEXT: vmv.v.v v0, v17 ; V-NEXT: vslidedown.vi v8, v8, 1, v0.t ; V-NEXT: vmv.v.v v0, v18 -; V-NEXT: vmerge.vvm v12, v8, v10, v0 +; V-NEXT: vmerge.vvm v12, v8, v22, v0 ; V-NEXT: vmv.s.x v0, a0 ; V-NEXT: vsetivli zero, 16, e64, m4, ta, ma ; V-NEXT: vmerge.vvm v8, v12, v12, v0 @@ -1651,11 +1651,11 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal ; ; ZVE32F-LABEL: unzip2a_dual_v16i64_exact: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: ld a5, 96(a2) +; ZVE32F-NEXT: ld a4, 96(a2) ; ZVE32F-NEXT: ld a7, 0(a1) -; ZVE32F-NEXT: ld a4, 16(a1) +; ZVE32F-NEXT: ld a3, 16(a1) ; ZVE32F-NEXT: ld t0, 32(a1) -; ZVE32F-NEXT: ld a3, 48(a1) +; ZVE32F-NEXT: ld a5, 48(a1) ; ZVE32F-NEXT: ld t1, 64(a1) ; ZVE32F-NEXT: ld a6, 80(a1) ; ZVE32F-NEXT: ld t2, 96(a1) @@ -1672,55 +1672,55 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal ; ZVE32F-NEXT: vslide1down.vx v11, v11, t2 ; ZVE32F-NEXT: vslide1down.vx v10, v10, t1 ; ZVE32F-NEXT: vslide1down.vx v12, v9, t0 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; ZVE32F-NEXT: ld t0, 0(a2) -; ZVE32F-NEXT: ld t1, 16(a2) -; ZVE32F-NEXT: ld t2, 32(a2) -; ZVE32F-NEXT: ld a7, 48(a2) -; ZVE32F-NEXT: vmv.v.x v9, t0 -; ZVE32F-NEXT: srli t0, t0, 32 -; ZVE32F-NEXT: vmv.v.x v13, t2 -; ZVE32F-NEXT: srli t2, t2, 32 -; ZVE32F-NEXT: vslide1down.vx v13, v13, t2 -; ZVE32F-NEXT: vslide1down.vx v14, v9, t0 -; ZVE32F-NEXT: ld t0, 64(a2) -; ZVE32F-NEXT: ld t2, 112(a2) -; ZVE32F-NEXT: vmv.v.x v9, a5 -; ZVE32F-NEXT: srli a5, a5, 32 -; ZVE32F-NEXT: vslide1down.vx 
v15, v9, a5 +; ZVE32F-NEXT: vslide1down.vx v13, v8, a7 +; ZVE32F-NEXT: ld a7, 0(a2) +; ZVE32F-NEXT: ld t0, 16(a2) +; ZVE32F-NEXT: ld t1, 32(a2) +; ZVE32F-NEXT: ld t2, 48(a2) +; ZVE32F-NEXT: vmv.v.x v8, a7 +; ZVE32F-NEXT: srli a7, a7, 32 +; ZVE32F-NEXT: vmv.v.x v9, t1 +; ZVE32F-NEXT: srli t1, t1, 32 +; ZVE32F-NEXT: vslide1down.vx v14, v9, t1 +; ZVE32F-NEXT: vslide1down.vx v15, v8, a7 +; ZVE32F-NEXT: ld a7, 64(a2) +; ZVE32F-NEXT: ld t1, 112(a2) +; ZVE32F-NEXT: vmv.v.x v8, a4 +; ZVE32F-NEXT: srli a4, a4, 32 +; ZVE32F-NEXT: vslide1down.vx v16, v8, a4 ; ZVE32F-NEXT: ld a2, 80(a2) -; ZVE32F-NEXT: vmv.v.x v9, t0 -; ZVE32F-NEXT: srli a5, t0, 32 -; ZVE32F-NEXT: vslide1down.vx v16, v9, a5 -; ZVE32F-NEXT: vslide1down.vx v9, v11, a1 +; ZVE32F-NEXT: vmv.v.x v8, a7 +; ZVE32F-NEXT: srli a4, a7, 32 +; ZVE32F-NEXT: vslide1down.vx v17, v8, a4 +; ZVE32F-NEXT: vslide1down.vx v8, v11, a1 ; ZVE32F-NEXT: srli a1, a1, 32 -; ZVE32F-NEXT: vslide1down.vx v9, v9, a1 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a6 +; ZVE32F-NEXT: vslide1down.vx v9, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v8, v10, a6 ; ZVE32F-NEXT: srli a1, a6, 32 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a1 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a4 -; ZVE32F-NEXT: srli a4, a4, 32 -; ZVE32F-NEXT: vslide1down.vx v11, v8, a4 -; ZVE32F-NEXT: vmv.v.i v0, 15 -; ZVE32F-NEXT: vslide1down.vx v8, v14, t1 -; ZVE32F-NEXT: srli a1, t1, 32 -; ZVE32F-NEXT: vslide1down.vx v14, v8, a1 -; ZVE32F-NEXT: vslidedown.vi v9, v10, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v8, v12, a3 +; ZVE32F-NEXT: vslide1down.vx v18, v8, a1 +; ZVE32F-NEXT: vslide1down.vx v8, v12, a5 +; ZVE32F-NEXT: srli a5, a5, 32 +; ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; ZVE32F-NEXT: vslide1down.vx v10, v13, a3 ; ZVE32F-NEXT: srli a3, a3, 32 -; ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; ZVE32F-NEXT: vslidedown.vi v8, v11, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v10, v13, a7 -; ZVE32F-NEXT: srli a1, a7, 32 -; ZVE32F-NEXT: vslide1down.vx v10, v10, a1 -; ZVE32F-NEXT: vslidedown.vi v10, v14, 4, v0.t -; ZVE32F-NEXT: vslide1down.vx v11, v15, t2 +; ZVE32F-NEXT: vslide1down.vx v12, v10, a3 +; ZVE32F-NEXT: vslide1down.vx v10, v14, t2 ; ZVE32F-NEXT: srli a1, t2, 32 +; ZVE32F-NEXT: vslide1down.vx v10, v10, a1 +; ZVE32F-NEXT: vslide1down.vx v11, v15, t0 +; ZVE32F-NEXT: srli a1, t0, 32 +; ZVE32F-NEXT: vslide1down.vx v13, v11, a1 +; ZVE32F-NEXT: vslide1down.vx v11, v16, t1 +; ZVE32F-NEXT: srli a1, t1, 32 ; ZVE32F-NEXT: vslide1down.vx v11, v11, a1 -; ZVE32F-NEXT: vslide1down.vx v12, v16, a2 +; ZVE32F-NEXT: vmv.v.i v0, 15 +; ZVE32F-NEXT: vslide1down.vx v14, v17, a2 ; ZVE32F-NEXT: srli a2, a2, 32 -; ZVE32F-NEXT: vslide1down.vx v12, v12, a2 -; ZVE32F-NEXT: vslidedown.vi v11, v12, 4, v0.t +; ZVE32F-NEXT: vslide1down.vx v14, v14, a2 +; ZVE32F-NEXT: vslidedown.vi v9, v18, 4, v0.t +; ZVE32F-NEXT: vslidedown.vi v8, v12, 4, v0.t +; ZVE32F-NEXT: vslidedown.vi v10, v13, 4, v0.t +; ZVE32F-NEXT: vslidedown.vi v11, v14, 4, v0.t ; ZVE32F-NEXT: vs4r.v v8, (a0) ; ZVE32F-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 9c6d77dde1b5c..020654bbff5eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -57,8 +57,8 @@ define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v11, v9, 1 -; CHECK-NEXT: vslideup.vi v11, v9, 1 ; CHECK-NEXT: 
vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v11, v9, 1 ; CHECK-NEXT: vslideup.vi v10, v8, 1 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret @@ -291,8 +291,8 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range ; CHECK-NEXT: vmv.v.i v0, 1 ; CHECK-NEXT: vslideup.vi v13, v11, 1 ; CHECK-NEXT: vslidedown.vi v11, v10, 1, v0.t -; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: vrgather.vi v12, v9, 0 +; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfadd.vv v8, v12, v10 ; CHECK-NEXT: ret @@ -324,8 +324,8 @@ define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_ran ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vi v12, v10, 2 -; CHECK-NEXT: vslideup.vi v12, v11, 3 ; CHECK-NEXT: vrgather.vi v14, v8, 2 +; CHECK-NEXT: vslideup.vi v12, v11, 3 ; CHECK-NEXT: vrgather.vi v15, v10, 3 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll index 5683476852683..0887f3cb3d6b2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll @@ -972,8 +972,8 @@ define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) { ; CHECK-NEXT: vmerge.vim v9, v9, 0, v0 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 ; CHECK-NEXT: vslidedown.vx v10, v9, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 @@ -1377,17 +1377,18 @@ define <8 x i64> @shuffle_v8i64_span_splat_neg(<8 x i64> %a) nounwind { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v9, 1 ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v10, v9, a0 +; CHECK-NEXT: vslidedown.vx v9, v9, a0 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v13, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v10, v10, a0 +; CHECK-NEXT: vslidedown.vx v9, v9, a0 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v9 -; CHECK-NEXT: vrgatherei16.vv v14, v8, v10 +; CHECK-NEXT: vrgatherei16.vv v14, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v9, v10, a0 +; CHECK-NEXT: vslidedown.vx v9, v9, a0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vrgatherei16.vv v15, v8, v9 ; CHECK-NEXT: vmv4r.v v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index 06521a8739bac..ed1ecf7b50101 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -871,10 +871,10 @@ define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vrsub.vi v11, v10, 15 ; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vrgather.vv v8, v9, v11 -; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 16 ; 
CHECK-NEXT: ret @@ -943,18 +943,18 @@ define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) { ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v13, v8, v16 -; CHECK-NEXT: vrgather.vv v15, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v9, v16 -; CHECK-NEXT: vrgather.vv v14, v11, v16 +; CHECK-NEXT: vrsub.vx v14, v12, a1 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrgather.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vv v12, v9, v14 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vv v8, v11, v14 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vslidedown.vx v8, v14, a0 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 16 ; CHECK-NEXT: ret %res = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> @@ -1010,14 +1010,14 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v13, v8, v16 -; CHECK-NEXT: vrgather.vv v15, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v9, v16 -; CHECK-NEXT: vrgather.vv v14, v11, v16 +; CHECK-NEXT: vrsub.vx v14, v12, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vv v12, v9, v14 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vv v8, v11, v14 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vslidedown.vx v8, v14, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: ret @@ -1032,22 +1032,22 @@ define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) { ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a1, a0, 2 -; CHECK-NEXT: addi a0, a0, -16 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vrsub.vx v24, v16, a1 -; CHECK-NEXT: vrgather.vv v19, v8, v24 -; CHECK-NEXT: vrgather.vv v23, v12, v24 -; CHECK-NEXT: vrgather.vv v18, v9, v24 -; CHECK-NEXT: vrgather.vv v22, v13, v24 -; CHECK-NEXT: vrgather.vv v17, v10, v24 -; CHECK-NEXT: vrgather.vv v21, v14, v24 -; CHECK-NEXT: vrgather.vv v16, v11, v24 -; CHECK-NEXT: vrgather.vv v20, v15, v24 +; CHECK-NEXT: vrsub.vx v20, v16, a1 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vrgather.vv v19, v8, v20 +; CHECK-NEXT: vrgather.vv v18, v9, v20 +; CHECK-NEXT: vrgather.vv v17, v10, v20 +; CHECK-NEXT: vrgather.vv v16, v11, v20 +; CHECK-NEXT: vrgather.vv v11, v12, v20 +; CHECK-NEXT: vrgather.vv v10, v13, v20 +; CHECK-NEXT: vrgather.vv v9, v14, v20 +; CHECK-NEXT: vrgather.vv v8, v15, v20 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v16, a0 -; CHECK-NEXT: vslidedown.vx v8, v20, a0 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vslideup.vi v8, v16, 16 ; CHECK-NEXT: ret %res = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> @@ -1079,14 +1079,14 @@ define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) { ; CHECK-NEXT: srli a0, a0, 2 ; 
CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -4 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v13, v8, v16 -; CHECK-NEXT: vrgather.vv v15, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v9, v16 -; CHECK-NEXT: vrgather.vv v14, v11, v16 +; CHECK-NEXT: vrsub.vx v14, v12, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vv v12, v9, v14 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vv v8, v11, v14 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vslidedown.vx v8, v14, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: ret @@ -1156,13 +1156,13 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) { ; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: srli a1, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: vrsub.vx v12, v10, a1 -; CHECK-NEXT: vrgather.vv v11, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v12 +; CHECK-NEXT: vrsub.vx v13, v10, a1 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: vrgather.vv v8, v9, v12 ; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: vrgather.vv v11, v8, v13 +; CHECK-NEXT: vrgather.vv v8, v12, v13 +; CHECK-NEXT: vrgather.vv v10, v9, v13 ; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -1220,14 +1220,14 @@ define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) { ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -8 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v13, v8, v16 -; CHECK-NEXT: vrgather.vv v15, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v9, v16 -; CHECK-NEXT: vrgather.vv v14, v11, v16 +; CHECK-NEXT: vrsub.vx v14, v12, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vv v12, v9, v14 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vv v8, v11, v14 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vslidedown.vx v8, v14, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: ret @@ -1260,14 +1260,14 @@ define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) { ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: addi a0, a0, -4 -; CHECK-NEXT: vrsub.vx v16, v12, a1 -; CHECK-NEXT: vrgather.vv v13, v8, v16 -; CHECK-NEXT: vrgather.vv v15, v10, v16 -; CHECK-NEXT: vrgather.vv v12, v9, v16 -; CHECK-NEXT: vrgather.vv v14, v11, v16 +; CHECK-NEXT: vrsub.vx v14, v12, a1 +; CHECK-NEXT: vrgather.vv v13, v8, v14 +; CHECK-NEXT: vrgather.vv v12, v9, v14 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vv v8, v11, v14 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vslidedown.vx v8, v14, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index c76aa7c4d317d..85e769c86974e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -895,10 +895,10 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) ; ZVKB-ZVE32X-LABEL: 
shuffle_v8f32_as_i64_exact: ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; ZVKB-ZVE32X-NEXT: vmv.v.i v0, 10 ; ZVKB-ZVE32X-NEXT: vslidedown.vi v11, v9, 1 -; ZVKB-ZVE32X-NEXT: vslideup.vi v11, v9, 1, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.i v0, 10 ; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v11, v9, 1, v0.t ; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1, v0.t ; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index ddde1e94abbde..a526974f77730 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -255,14 +255,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) { ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV32-NEXT: vle16.v v8, (a0) ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a4, a0, a4 -; RV32-NEXT: vle16.v v10, (a4) -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: vle16.v v9, (a2) +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: vle16.v v10, (a0) +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: vle16.v v11, (a0) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vslideup.vi v10, v9, 4 -; RV32-NEXT: vslideup.vi v8, v11, 4 +; RV32-NEXT: vslideup.vi v8, v9, 4 +; RV32-NEXT: vslideup.vi v10, v11, 4 ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vslideup.vi v8, v10, 8 ; RV32-NEXT: vse16.v v8, (a1) @@ -273,14 +273,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) { ; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV64-NEXT: vle16.v v8, (a0) ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a3, a0, a3 -; RV64-NEXT: vle16.v v10, (a3) -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: vle16.v v9, (a2) +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: vle16.v v10, (a0) +; RV64-NEXT: add a0, a0, a2 ; RV64-NEXT: vle16.v v11, (a0) ; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vslideup.vi v10, v9, 4 -; RV64-NEXT: vslideup.vi v8, v11, 4 +; RV64-NEXT: vslideup.vi v8, v9, 4 +; RV64-NEXT: vslideup.vi v10, v11, 4 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vslideup.vi v8, v10, 8 ; RV64-NEXT: vse16.v v8, (a1) @@ -291,14 +291,14 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) { ; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVE64F-NEXT: vle16.v v8, (a0) ; ZVE64F-NEXT: add a0, a0, a2 -; ZVE64F-NEXT: add a3, a0, a3 -; ZVE64F-NEXT: vle16.v v10, (a3) -; ZVE64F-NEXT: add a2, a3, a2 -; ZVE64F-NEXT: vle16.v v9, (a2) +; ZVE64F-NEXT: vle16.v v9, (a0) +; ZVE64F-NEXT: add a0, a0, a3 +; ZVE64F-NEXT: vle16.v v10, (a0) +; ZVE64F-NEXT: add a0, a0, a2 ; ZVE64F-NEXT: vle16.v v11, (a0) ; ZVE64F-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVE64F-NEXT: vslideup.vi v10, v9, 4 -; ZVE64F-NEXT: vslideup.vi v8, v11, 4 +; ZVE64F-NEXT: vslideup.vi v8, v9, 4 +; ZVE64F-NEXT: vslideup.vi v10, v11, 4 ; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVE64F-NEXT: vslideup.vi v8, v10, 8 ; ZVE64F-NEXT: vse16.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll index 461b4d0e02cb8..62446af09ca2d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll @@ -68,11 +68,11 @@ define <128 x i7> 
@vtrunc_v128i7_v128i16(<128 x i16> %a, <128 x i1> %m, i32 zero ; CHECK-NEXT: sltu a0, a0, a2 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t -; CHECK-NEXT: li a0, 128 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vslideup.vx v8, v24, a1 ; CHECK-NEXT: ret %v = call <128 x i7> @llvm.vp.trunc.v128i7.v128i16(<128 x i16> %a, <128 x i1> %m, i32 %vl) @@ -272,14 +272,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV32-NEXT: # %bb.3: ; RV32-NEXT: li a6, 64 ; RV32-NEXT: .LBB16_4: -; RV32-NEXT: addi t2, a1, 128 +; RV32-NEXT: addi t3, a1, 128 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v6, v4, 2 -; RV32-NEXT: addi t6, a1, 512 +; RV32-NEXT: addi s0, a1, 512 ; RV32-NEXT: addi t5, a1, 640 ; RV32-NEXT: vslidedown.vi v0, v3, 2 ; RV32-NEXT: addi t1, t1, -1 -; RV32-NEXT: addi t3, a1, 384 +; RV32-NEXT: addi t2, a1, 384 ; RV32-NEXT: vslidedown.vi v2, v5, 2 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: addi t4, a6, -32 @@ -287,39 +287,44 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV32-NEXT: addi a6, a6, -1 ; RV32-NEXT: and a6, a6, t4 ; RV32-NEXT: addi t4, a6, -16 -; RV32-NEXT: sltu s0, a6, t4 -; RV32-NEXT: addi s0, s0, -1 +; RV32-NEXT: sltu t6, a6, t4 +; RV32-NEXT: addi t6, t6, -1 ; RV32-NEXT: bltu a6, a2, .LBB16_6 ; RV32-NEXT: # %bb.5: ; RV32-NEXT: li a6, 16 ; RV32-NEXT: .LBB16_6: ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v8, (t6) -; RV32-NEXT: csrr t6, vlenb +; RV32-NEXT: vle64.v v8, (s0) +; RV32-NEXT: csrr s0, vlenb ; RV32-NEXT: sw a0, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: li a0, 56 -; RV32-NEXT: mul t6, t6, a0 +; RV32-NEXT: mul s0, s0, a0 ; RV32-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: add t6, sp, t6 -; RV32-NEXT: addi t6, t6, 16 -; RV32-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill -; RV32-NEXT: vle64.v v8, (t5) -; RV32-NEXT: vle64.v v16, (t2) +; RV32-NEXT: add s0, sp, s0 +; RV32-NEXT: addi s0, s0, 16 +; RV32-NEXT: vs8r.v v8, (s0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vle64.v v16, (t5) +; RV32-NEXT: vle64.v v8, (t3) +; RV32-NEXT: csrr t3, vlenb +; RV32-NEXT: slli t3, t3, 3 +; RV32-NEXT: add t3, sp, t3 +; RV32-NEXT: addi t3, t3, 16 +; RV32-NEXT: vs8r.v v8, (t3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vle64.v v24, (a1) -; RV32-NEXT: csrr t2, vlenb +; RV32-NEXT: csrr t3, vlenb ; RV32-NEXT: li t5, 48 -; RV32-NEXT: mul t2, t2, t5 -; RV32-NEXT: add t2, sp, t2 -; RV32-NEXT: addi t2, t2, 16 -; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill -; RV32-NEXT: vle64.v v24, (t3) +; RV32-NEXT: mul t3, t3, t5 +; RV32-NEXT: add t3, sp, t3 +; RV32-NEXT: addi t3, t3, 16 +; RV32-NEXT: vs8r.v v24, (t3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vle64.v v24, (t2) ; RV32-NEXT: csrr t2, vlenb -; RV32-NEXT: slli t2, t2, 3 +; RV32-NEXT: slli t2, t2, 4 ; RV32-NEXT: add t2, sp, t2 ; RV32-NEXT: addi t2, t2, 16 ; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill ; RV32-NEXT: and t2, t1, t0 -; RV32-NEXT: and t1, s0, t4 +; RV32-NEXT: and t1, t6, t4 ; RV32-NEXT: addi a1, a1, 256 ; RV32-NEXT: mv t0, a4 ; RV32-NEXT: bltu a4, a3, .LBB16_8 @@ -327,45 +332,50 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV32-NEXT: li t0, 32 ; RV32-NEXT: .LBB16_8: ; RV32-NEXT: vsetvli zero, t2, e32, m4, ta, ma -; 
RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t -; RV32-NEXT: csrr t2, vlenb -; RV32-NEXT: li t3, 24 -; RV32-NEXT: mul t2, t2, t3 -; RV32-NEXT: add t2, sp, t2 -; RV32-NEXT: addi t2, t2, 16 -; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t +; RV32-NEXT: addi t2, sp, 16 +; RV32-NEXT: vs8r.v v8, (t2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr t2, vlenb ; RV32-NEXT: li t3, 56 ; RV32-NEXT: mul t2, t2, t3 ; RV32-NEXT: add t2, sp, t2 ; RV32-NEXT: addi t2, t2, 16 -; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v8, (t2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; RV32-NEXT: vnsrl.wi v8, v24, 0, v0.t +; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: slli a5, a5, 6 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v6 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, t1, e32, m4, ta, ma -; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t +; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t ; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: li t1, 24 +; RV32-NEXT: mul a5, a5, t1 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV32-NEXT: addi a5, t0, -16 ; RV32-NEXT: sltu t0, t0, a5 ; RV32-NEXT: addi t0, t0, -1 ; RV32-NEXT: and a5, t0, a5 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle64.v v8, (a1) -; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v30, v7, 2 +; RV32-NEXT: vslidedown.vi v12, v7, 2 ; RV32-NEXT: vmv1r.v v0, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li t0, 48 @@ -383,95 +393,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; RV32-NEXT: vnsrl.wi v16, v8, 0, v0.t +; RV32-NEXT: vnsrl.wi v8, v16, 0, v0.t ; RV32-NEXT: bltu a4, a2, .LBB16_10 ; RV32-NEXT: # %bb.9: ; RV32-NEXT: li a4, 16 ; RV32-NEXT: .LBB16_10: ; RV32-NEXT: vmv1r.v v0, v5 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a4, e32, m4, ta, ma -; RV32-NEXT: vnsrl.wi v24, v8, 0, v0.t +; RV32-NEXT: vnsrl.wi v16, v24, 0, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a4, 48 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # 
vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: mv a1, a7 ; RV32-NEXT: bltu a7, a3, .LBB16_12 ; RV32-NEXT: # %bb.11: ; RV32-NEXT: li a1, 32 ; RV32-NEXT: .LBB16_12: -; RV32-NEXT: vmv1r.v v0, v30 +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmv4r.v v24, v16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 6 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vslideup.vi v16, v24, 16 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 6 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: li a5, 56 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV32-NEXT: vslideup.vi v16, v24, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 +; RV32-NEXT: li a5, 56 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 6 +; RV32-NEXT: li a5, 48 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vslideup.vi v16, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 6 +; RV32-NEXT: li a5, 48 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV32-NEXT: addi a4, a1, -16 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: vslideup.vi v16, v24, 16 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 56 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 48 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 24 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; RV32-NEXT: vslideup.vi v16, v8, 16 -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: li a6, 48 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV32-NEXT: sltu a1, a1, a4 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a4 @@ -492,12 +492,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x 
i1> %m, i32 ze ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a7, e32, m4, ta, ma -; RV32-NEXT: vnsrl.wi v24, v16, 0, v0.t +; RV32-NEXT: vnsrl.wi v16, v24, 0, v0.t ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vslideup.vi v24, v8, 16 -; RV32-NEXT: vse32.v v24, (a0) +; RV32-NEXT: vslideup.vi v16, v8, 16 +; RV32-NEXT: vse32.v v16, (a0) ; RV32-NEXT: addi a1, a0, 256 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 48 @@ -582,14 +582,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV64-NEXT: # %bb.3: ; RV64-NEXT: li a6, 64 ; RV64-NEXT: .LBB16_4: -; RV64-NEXT: addi t2, a1, 128 +; RV64-NEXT: addi t3, a1, 128 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v6, v4, 2 -; RV64-NEXT: addi t6, a1, 512 +; RV64-NEXT: addi s0, a1, 512 ; RV64-NEXT: addi t5, a1, 640 ; RV64-NEXT: vslidedown.vi v0, v3, 2 ; RV64-NEXT: addi t1, t1, -1 -; RV64-NEXT: addi t3, a1, 384 +; RV64-NEXT: addi t2, a1, 384 ; RV64-NEXT: vslidedown.vi v2, v5, 2 ; RV64-NEXT: li a3, 32 ; RV64-NEXT: addi t4, a6, -32 @@ -597,39 +597,44 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV64-NEXT: addi a6, a6, -1 ; RV64-NEXT: and a6, a6, t4 ; RV64-NEXT: addi t4, a6, -16 -; RV64-NEXT: sltu s0, a6, t4 -; RV64-NEXT: addi s0, s0, -1 +; RV64-NEXT: sltu t6, a6, t4 +; RV64-NEXT: addi t6, t6, -1 ; RV64-NEXT: bltu a6, a2, .LBB16_6 ; RV64-NEXT: # %bb.5: ; RV64-NEXT: li a6, 16 ; RV64-NEXT: .LBB16_6: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (t6) -; RV64-NEXT: csrr t6, vlenb +; RV64-NEXT: vle64.v v8, (s0) +; RV64-NEXT: csrr s0, vlenb ; RV64-NEXT: sd a0, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: li a0, 56 -; RV64-NEXT: mul t6, t6, a0 +; RV64-NEXT: mul s0, s0, a0 ; RV64-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: add t6, sp, t6 -; RV64-NEXT: addi t6, t6, 32 -; RV64-NEXT: vs8r.v v8, (t6) # vscale x 64-byte Folded Spill -; RV64-NEXT: vle64.v v8, (t5) -; RV64-NEXT: vle64.v v16, (t2) +; RV64-NEXT: add s0, sp, s0 +; RV64-NEXT: addi s0, s0, 32 +; RV64-NEXT: vs8r.v v8, (s0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vle64.v v16, (t5) +; RV64-NEXT: vle64.v v8, (t3) +; RV64-NEXT: csrr t3, vlenb +; RV64-NEXT: slli t3, t3, 3 +; RV64-NEXT: add t3, sp, t3 +; RV64-NEXT: addi t3, t3, 32 +; RV64-NEXT: vs8r.v v8, (t3) # vscale x 64-byte Folded Spill ; RV64-NEXT: vle64.v v24, (a1) -; RV64-NEXT: csrr t2, vlenb +; RV64-NEXT: csrr t3, vlenb ; RV64-NEXT: li t5, 48 -; RV64-NEXT: mul t2, t2, t5 -; RV64-NEXT: add t2, sp, t2 -; RV64-NEXT: addi t2, t2, 32 -; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill -; RV64-NEXT: vle64.v v24, (t3) +; RV64-NEXT: mul t3, t3, t5 +; RV64-NEXT: add t3, sp, t3 +; RV64-NEXT: addi t3, t3, 32 +; RV64-NEXT: vs8r.v v24, (t3) # vscale x 64-byte Folded Spill +; RV64-NEXT: vle64.v v24, (t2) ; RV64-NEXT: csrr t2, vlenb -; RV64-NEXT: slli t2, t2, 3 +; RV64-NEXT: slli t2, t2, 4 ; RV64-NEXT: add t2, sp, t2 ; RV64-NEXT: addi t2, t2, 32 ; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill ; RV64-NEXT: and t2, t1, t0 -; RV64-NEXT: and t1, s0, t4 +; RV64-NEXT: and t1, t6, t4 ; RV64-NEXT: addi a1, a1, 256 ; RV64-NEXT: mv t0, a4 ; RV64-NEXT: bltu a4, a3, .LBB16_8 @@ -637,45 +642,50 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV64-NEXT: li t0, 32 ; RV64-NEXT: 
.LBB16_8: ; RV64-NEXT: vsetvli zero, t2, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v24, v8, 0, v0.t -; RV64-NEXT: csrr t2, vlenb -; RV64-NEXT: li t3, 24 -; RV64-NEXT: mul t2, t2, t3 -; RV64-NEXT: add t2, sp, t2 -; RV64-NEXT: addi t2, t2, 32 -; RV64-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vnsrl.wi v8, v16, 0, v0.t +; RV64-NEXT: addi t2, sp, 32 +; RV64-NEXT: vs8r.v v8, (t2) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v3 ; RV64-NEXT: csrr t2, vlenb ; RV64-NEXT: li t3, 56 ; RV64-NEXT: mul t2, t2, t3 ; RV64-NEXT: add t2, sp, t2 ; RV64-NEXT: addi t2, t2, 32 -; RV64-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v8, (t2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v8, v24, 0, v0.t +; RV64-NEXT: vnsrl.wi v16, v8, 0, v0.t ; RV64-NEXT: csrr a5, vlenb ; RV64-NEXT: slli a5, a5, 6 ; RV64-NEXT: add a5, sp, a5 ; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: csrr a5, vlenb +; RV64-NEXT: slli a5, a5, 3 +; RV64-NEXT: add a5, sp, a5 +; RV64-NEXT: addi a5, a5, 32 +; RV64-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, t1, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v8, v16, 0, v0.t +; RV64-NEXT: vnsrl.wi v16, v8, 0, v0.t ; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: slli a5, a5, 4 +; RV64-NEXT: li t1, 24 +; RV64-NEXT: mul a5, a5, t1 ; RV64-NEXT: add a5, sp, a5 ; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV64-NEXT: addi a5, t0, -16 ; RV64-NEXT: sltu t0, t0, a5 ; RV64-NEXT: addi t0, t0, -1 ; RV64-NEXT: and a5, t0, a5 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v8, (a1) -; RV64-NEXT: addi a1, sp, 32 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 ; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v30, v7, 2 +; RV64-NEXT: vslidedown.vi v12, v7, 2 ; RV64-NEXT: vmv1r.v v0, v4 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li t0, 48 @@ -693,95 +703,85 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV64-NEXT: vmv1r.v v0, v2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 32 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a5, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v16, v8, 0, v0.t +; RV64-NEXT: vnsrl.wi v8, v16, 0, v0.t ; RV64-NEXT: bltu a4, a2, .LBB16_10 ; RV64-NEXT: # %bb.9: ; RV64-NEXT: li a4, 16 ; RV64-NEXT: .LBB16_10: ; RV64-NEXT: vmv1r.v v0, v5 -; RV64-NEXT: addi a1, sp, 32 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a4, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v24, v8, 0, v0.t +; RV64-NEXT: vnsrl.wi v16, v24, 0, v0.t ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a4, 48 ; RV64-NEXT: mul a1, a1, a4 ; RV64-NEXT: add a1, sp, a1 ; 
RV64-NEXT: addi a1, a1, 32 -; RV64-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; RV64-NEXT: mv a1, a7 ; RV64-NEXT: bltu a7, a3, .LBB16_12 ; RV64-NEXT: # %bb.11: ; RV64-NEXT: li a1, 32 ; RV64-NEXT: .LBB16_12: -; RV64-NEXT: vmv1r.v v0, v30 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: addi a4, sp, 32 +; RV64-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmv4r.v v24, v16 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 6 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 32 +; RV64-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV64-NEXT: vslideup.vi v16, v24, 16 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: slli a4, a4, 6 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 32 +; RV64-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 24 ; RV64-NEXT: mul a4, a4, a5 ; RV64-NEXT: add a4, sp, a4 ; RV64-NEXT: addi a4, a4, 32 -; RV64-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmv4r.v v24, v16 ; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 4 +; RV64-NEXT: li a5, 56 +; RV64-NEXT: mul a4, a4, a5 ; RV64-NEXT: add a4, sp, a4 ; RV64-NEXT: addi a4, a4, 32 -; RV64-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; RV64-NEXT: vslideup.vi v16, v24, 16 ; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: li a5, 24 +; RV64-NEXT: li a5, 56 ; RV64-NEXT: mul a4, a4, a5 ; RV64-NEXT: add a4, sp, a4 ; RV64-NEXT: addi a4, a4, 32 ; RV64-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 6 +; RV64-NEXT: li a5, 48 +; RV64-NEXT: mul a4, a4, a5 ; RV64-NEXT: add a4, sp, a4 ; RV64-NEXT: addi a4, a4, 32 ; RV64-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vslideup.vi v16, v8, 16 ; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 6 +; RV64-NEXT: li a5, 48 +; RV64-NEXT: mul a4, a4, a5 ; RV64-NEXT: add a4, sp, a4 ; RV64-NEXT: addi a4, a4, 32 ; RV64-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; RV64-NEXT: addi a4, a1, -16 -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 56 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV64-NEXT: vslideup.vi v16, v24, 16 -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 56 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 48 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 24 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload -; RV64-NEXT: vslideup.vi v16, v8, 16 -; RV64-NEXT: csrr a5, vlenb -; RV64-NEXT: li a6, 48 -; RV64-NEXT: mul a5, a5, a6 -; RV64-NEXT: add a5, sp, a5 -; RV64-NEXT: addi a5, a5, 32 -; RV64-NEXT: vs8r.v v16, (a5) # vscale x 64-byte Folded Spill ; RV64-NEXT: sltu a1, a1, a4 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a4 @@ -802,12 +802,12 @@ define 
<128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 32 -; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a7, e32, m4, ta, ma -; RV64-NEXT: vnsrl.wi v24, v16, 0, v0.t +; RV64-NEXT: vnsrl.wi v16, v24, 0, v0.t ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV64-NEXT: vslideup.vi v24, v8, 16 -; RV64-NEXT: vse32.v v24, (a0) +; RV64-NEXT: vslideup.vi v16, v8, 16 +; RV64-NEXT: vse32.v v16, (a0) ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: li a3, 48 @@ -865,11 +865,11 @@ define <32 x i32> @vtrunc_v32i32_v32i64(<32 x i64> %a, <32 x i1> %m, i32 zeroext ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v24, v16, 0, v0.t -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vslideup.vi v8, v24, 16 ; CHECK-NEXT: ret %v = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> %a, <32 x i1> %m, i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 352666de57881..0b8aee7730009 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -299,12 +299,12 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> % ; RV64-NEXT: vsetivli zero, 16, e8, m2, ta, ma ; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: sltu a1, a1, a2 -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: li a0, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximumnum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximumnum-sdnode.ll index dce5004d03e16..404f1c6d71870 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximumnum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximumnum-sdnode.ll @@ -208,11 +208,9 @@ define @vfmax_vv_nxv32bf16( %va, @vfmax_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, 
ma -; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vfmax.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -536,11 +532,9 @@ define @vfmax_vv_nxv32f16( %va, @vfmax_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmax.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfmax.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimumnum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimumnum-sdnode.ll index fcb8ad82342d5..83186f73494cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimumnum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimumnum-sdnode.ll @@ -208,11 +208,9 @@ define @vfadd_vv_nxv32bf16( %va, @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vfmax.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -536,11 +532,9 @@ define @vfadd_vv_nxv32f16( %va, @vfadd_vf_nxv32f16( %va, half 
%b ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmax.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfmax.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 65ac424c2359a..168e40b089c8b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2285,9 +2285,9 @@ define @mgather_baseidx_nxv16i8(ptr %base, ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v10, (a0), v16, v0.t -; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: srli a1, a1, 3 ; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 @@ -2309,9 +2309,9 @@ define @mgather_baseidx_nxv32i8(ptr %base, ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu ; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vx v0, v0, a1 @@ -2325,30 +2325,30 @@ define @mgather_baseidx_nxv32i8(ptr %base, ; RV64-LABEL: mgather_baseidx_nxv32i8: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vmv1r.v v16, v0 -; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsext.vf8 v16, v8 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v12, (a0), v24, v0.t +; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v9 ; RV64-NEXT: srli a2, a1, 3 -; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 -; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v24, v9 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t ; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vx v8, v0, a2 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v8, v16, a1 +; RV64-NEXT: vslidedown.vx v9, v0, a1 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: 
vsetvli a1, zero, e8, m1, ta, mu +; RV64-NEXT: vluxei64.v v13, (a0), v16, v0.t ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v8, a2 +; RV64-NEXT: vslidedown.vx v0, v9, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v10 -; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll index d995a31f243d3..9c88dc9c1b2a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -295,11 +295,10 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -314,11 +313,9 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 ; RV32-BITS-256-NEXT: addi a0, a0, -1 +; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrsub.vx v12, v10, a0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrgather.vv v11, v8, v12 ; RV32-BITS-256-NEXT: vrgather.vv v10, v9, v12 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -333,11 +330,9 @@ define @reverse_nxv16i1( %a) { ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 ; RV32-BITS-512-NEXT: addi a0, a0, -1 +; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrsub.vx v12, v10, a0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrgather.vv v11, v8, v12 ; RV32-BITS-512-NEXT: vrgather.vv v10, v9, v12 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -352,11 +347,10 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v10, 0 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v8, v8, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, 
zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v8 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v8 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -371,11 +365,9 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 ; RV64-BITS-256-NEXT: addi a0, a0, -1 +; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrsub.vx v12, v10, a0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrgather.vv v11, v8, v12 ; RV64-BITS-256-NEXT: vrgather.vv v10, v9, v12 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -390,11 +382,9 @@ define @reverse_nxv16i1( %a) { ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m2, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 ; RV64-BITS-512-NEXT: addi a0, a0, -1 +; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrsub.vx v12, v10, a0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrgather.vv v11, v8, v12 ; RV64-BITS-512-NEXT: vrgather.vv v10, v9, v12 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -409,15 +399,14 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v12 +; RV32-BITS-UNKNOWN-NEXT: vid.v v16 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v12, v8, 1, v0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 @@ -430,15 +419,13 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vid.v v12 +; RV32-BITS-256-NEXT: vid.v v16 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 ; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vrsub.vx v16, v12, a0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-256-NEXT: vmerge.vim v12, v8, 1, v0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vrsub.vx v16, v16, a0 ; RV32-BITS-256-NEXT: vrgather.vv v11, v12, v16 ; RV32-BITS-256-NEXT: vrgather.vv v10, v13, v16 ; RV32-BITS-256-NEXT: vrgather.vv v9, v14, v16 @@ -451,15 +438,13 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vid.v 
v12 +; RV32-BITS-512-NEXT: vid.v v16 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vrsub.vx v16, v12, a0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV32-BITS-512-NEXT: vmerge.vim v12, v8, 1, v0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vrsub.vx v16, v16, a0 ; RV32-BITS-512-NEXT: vrgather.vv v11, v12, v16 ; RV32-BITS-512-NEXT: vrgather.vv v10, v13, v16 ; RV32-BITS-512-NEXT: vrgather.vv v9, v14, v16 @@ -472,15 +457,14 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v12 +; RV64-BITS-UNKNOWN-NEXT: vid.v v16 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v12, v8, 1, v0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v16, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 @@ -493,15 +477,13 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vid.v v12 +; RV64-BITS-256-NEXT: vid.v v16 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vrsub.vx v16, v12, a0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-256-NEXT: vmerge.vim v12, v8, 1, v0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vrsub.vx v16, v16, a0 ; RV64-BITS-256-NEXT: vrgather.vv v11, v12, v16 ; RV64-BITS-256-NEXT: vrgather.vv v10, v13, v16 ; RV64-BITS-256-NEXT: vrgather.vv v9, v14, v16 @@ -514,15 +496,13 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vid.v v12 +; RV64-BITS-512-NEXT: vid.v v16 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m4, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vrsub.vx v16, v12, a0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; RV64-BITS-512-NEXT: vmerge.vim v12, v8, 1, v0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vrsub.vx v16, v16, a0 ; RV64-BITS-512-NEXT: vrgather.vv v11, v12, v16 ; RV64-BITS-512-NEXT: vrgather.vv v10, v13, v16 ; RV64-BITS-512-NEXT: vrgather.vv v9, v14, v16 @@ -539,15 +519,14 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, 
vlenb ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vid.v v16 +; RV32-BITS-UNKNOWN-NEXT: vid.v v24 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 ; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 -; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v24, a0 +; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, v24 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v17, v24 ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v18, v24 @@ -564,15 +543,13 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vid.v v16 +; RV32-BITS-256-NEXT: vid.v v24 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV32-BITS-256-NEXT: vmv.v.i v8, 0 ; RV32-BITS-256-NEXT: addi a0, a0, -1 -; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV32-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 -; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-256-NEXT: vrsub.vx v24, v24, a0 ; RV32-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-256-NEXT: vrgather.vv v14, v17, v24 ; RV32-BITS-256-NEXT: vrgather.vv v13, v18, v24 @@ -589,15 +566,13 @@ define @reverse_nxv64i1( %a) { ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vid.v v16 +; RV32-BITS-512-NEXT: vid.v v24 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV32-BITS-512-NEXT: vmv.v.i v8, 0 ; RV32-BITS-512-NEXT: addi a0, a0, -1 -; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV32-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 -; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-BITS-512-NEXT: vrsub.vx v24, v24, a0 ; RV32-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-512-NEXT: vrgather.vv v14, v17, v24 ; RV32-BITS-512-NEXT: vrgather.vv v13, v18, v24 @@ -614,15 +589,14 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vid.v v16 +; RV64-BITS-UNKNOWN-NEXT: vid.v v24 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv.v.i v8, 0 ; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v16, v8, 1, v0 -; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v24, a0 +; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, 
v24 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v17, v24 ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v18, v24 @@ -639,15 +613,13 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vid.v v16 +; RV64-BITS-256-NEXT: vid.v v24 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV64-BITS-256-NEXT: vmv.v.i v8, 0 ; RV64-BITS-256-NEXT: addi a0, a0, -1 -; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV64-BITS-256-NEXT: vmerge.vim v16, v8, 1, v0 -; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-256-NEXT: vrsub.vx v24, v24, a0 ; RV64-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-256-NEXT: vrgather.vv v14, v17, v24 ; RV64-BITS-256-NEXT: vrgather.vv v13, v18, v24 @@ -664,15 +636,13 @@ define @reverse_nxv64i1( %a) { ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vid.v v16 +; RV64-BITS-512-NEXT: vid.v v24 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m8, ta, ma ; RV64-BITS-512-NEXT: vmv.v.i v8, 0 ; RV64-BITS-512-NEXT: addi a0, a0, -1 -; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vrsub.vx v24, v16, a0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV64-BITS-512-NEXT: vmerge.vim v16, v8, 1, v0 -; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-BITS-512-NEXT: vrsub.vx v24, v24, a0 ; RV64-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-512-NEXT: vrgather.vv v14, v17, v24 ; RV64-BITS-512-NEXT: vrgather.vv v13, v18, v24 @@ -1002,9 +972,9 @@ define @reverse_nxv16i8( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv16i8: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v10 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v12, v10, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v8, v12 @@ -1015,9 +985,9 @@ define @reverse_nxv16i8( %a) { ; RV32-BITS-256-LABEL: reverse_nxv16i8: ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v10 +; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vrsub.vx v12, v10, a0 ; RV32-BITS-256-NEXT: vrgather.vv v11, v8, v12 ; RV32-BITS-256-NEXT: vrgather.vv v10, v9, v12 @@ -1027,9 +997,9 @@ define @reverse_nxv16i8( %a) { ; RV32-BITS-512-LABEL: reverse_nxv16i8: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v10 +; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vrsub.vx v12, v10, a0 ; RV32-BITS-512-NEXT: vrgather.vv v11, v8, v12 ; RV32-BITS-512-NEXT: vrgather.vv v10, v9, v12 @@ -1039,9 +1009,9 @@ define @reverse_nxv16i8( %a) { ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i8: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, 
ta, ma ; RV64-BITS-UNKNOWN-NEXT: vid.v v10 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v12, v10, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v8, v12 @@ -1052,9 +1022,9 @@ define @reverse_nxv16i8( %a) { ; RV64-BITS-256-LABEL: reverse_nxv16i8: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v10 +; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vrsub.vx v12, v10, a0 ; RV64-BITS-256-NEXT: vrgather.vv v11, v8, v12 ; RV64-BITS-256-NEXT: vrgather.vv v10, v9, v12 @@ -1064,9 +1034,9 @@ define @reverse_nxv16i8( %a) { ; RV64-BITS-512-LABEL: reverse_nxv16i8: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v10 +; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vrsub.vx v12, v10, a0 ; RV64-BITS-512-NEXT: vrgather.vv v11, v8, v12 ; RV64-BITS-512-NEXT: vrgather.vv v10, v9, v12 @@ -1080,9 +1050,9 @@ define @reverse_nxv32i8( %a) { ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv32i8: ; RV32-BITS-UNKNOWN: # %bb.0: ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vid.v v12 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 @@ -1095,9 +1065,9 @@ define @reverse_nxv32i8( %a) { ; RV32-BITS-256-LABEL: reverse_nxv32i8: ; RV32-BITS-256: # %bb.0: ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vid.v v12 +; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-256-NEXT: vrgather.vv v15, v8, v16 ; RV32-BITS-256-NEXT: vrgather.vv v14, v9, v16 @@ -1109,9 +1079,9 @@ define @reverse_nxv32i8( %a) { ; RV32-BITS-512-LABEL: reverse_nxv32i8: ; RV32-BITS-512: # %bb.0: ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vid.v v12 +; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-512-NEXT: vrgather.vv v15, v8, v16 ; RV32-BITS-512-NEXT: vrgather.vv v14, v9, v16 @@ -1123,9 +1093,9 @@ define @reverse_nxv32i8( %a) { ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i8: ; RV64-BITS-UNKNOWN: # %bb.0: ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vid.v v12 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 @@ -1138,9 +1108,9 @@ define @reverse_nxv32i8( %a) { ; RV64-BITS-256-LABEL: reverse_nxv32i8: ; RV64-BITS-256: # %bb.0: ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vid.v v12 +; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-256-NEXT: vrgather.vv v15, v8, v16 ; RV64-BITS-256-NEXT: 
vrgather.vv v14, v9, v16 @@ -1152,9 +1122,9 @@ define @reverse_nxv32i8( %a) { ; RV64-BITS-512-LABEL: reverse_nxv32i8: ; RV64-BITS-512: # %bb.0: ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vid.v v12 +; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-512-NEXT: vrgather.vv v15, v8, v16 ; RV64-BITS-512-NEXT: vrgather.vv v14, v9, v16 @@ -1172,8 +1142,8 @@ define @reverse_nxv64i8( %a) { ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vmv8r.v v16, v8 ; RV32-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vid.v v8 +; RV32-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v24, v8, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, v24 @@ -1191,8 +1161,8 @@ define @reverse_nxv64i8( %a) { ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vmv8r.v v16, v8 ; RV32-BITS-256-NEXT: csrr a0, vlenb -; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vid.v v8 +; RV32-BITS-256-NEXT: addi a0, a0, -1 ; RV32-BITS-256-NEXT: vrsub.vx v24, v8, a0 ; RV32-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-256-NEXT: vrgather.vv v14, v17, v24 @@ -1209,8 +1179,8 @@ define @reverse_nxv64i8( %a) { ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vmv8r.v v16, v8 ; RV32-BITS-512-NEXT: csrr a0, vlenb -; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vid.v v8 +; RV32-BITS-512-NEXT: addi a0, a0, -1 ; RV32-BITS-512-NEXT: vrsub.vx v24, v8, a0 ; RV32-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV32-BITS-512-NEXT: vrgather.vv v14, v17, v24 @@ -1227,8 +1197,8 @@ define @reverse_nxv64i8( %a) { ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vmv8r.v v16, v8 ; RV64-BITS-UNKNOWN-NEXT: csrr a0, vlenb -; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vid.v v8 +; RV64-BITS-UNKNOWN-NEXT: addi a0, a0, -1 ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v24, v8, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v16, v24 @@ -1246,8 +1216,8 @@ define @reverse_nxv64i8( %a) { ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vmv8r.v v16, v8 ; RV64-BITS-256-NEXT: csrr a0, vlenb -; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vid.v v8 +; RV64-BITS-256-NEXT: addi a0, a0, -1 ; RV64-BITS-256-NEXT: vrsub.vx v24, v8, a0 ; RV64-BITS-256-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-256-NEXT: vrgather.vv v14, v17, v24 @@ -1264,8 +1234,8 @@ define @reverse_nxv64i8( %a) { ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vmv8r.v v16, v8 ; RV64-BITS-512-NEXT: csrr a0, vlenb -; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vid.v v8 +; RV64-BITS-512-NEXT: addi a0, a0, -1 ; RV64-BITS-512-NEXT: vrsub.vx v24, v8, a0 ; RV64-BITS-512-NEXT: vrgather.vv v15, v16, v24 ; RV64-BITS-512-NEXT: vrgather.vv v14, v17, v24 @@ -1332,10 +1302,10 @@ define @reverse_nxv8i16( %a) { ; CHECK-LABEL: reverse_nxv8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: 
vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1349,10 +1319,10 @@ define @reverse_nxv16i16( %a) { ; CHECK-LABEL: reverse_nxv16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1367,21 +1337,21 @@ define @reverse_nxv16i16( %a) { define @reverse_nxv32i16( %a) { ; CHECK-LABEL: reverse_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv32i16( %a) ret %res @@ -1423,10 +1393,10 @@ define @reverse_nxv4i32( %a) { ; CHECK-LABEL: reverse_nxv4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1440,10 +1410,10 @@ define @reverse_nxv8i32( %a) { ; CHECK-LABEL: reverse_nxv8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1458,21 +1428,21 @@ define @reverse_nxv8i32( %a) { define @reverse_nxv16i32( %a) { ; CHECK-LABEL: reverse_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; 
CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16i32( %a) ret %res @@ -1498,10 +1468,10 @@ define @reverse_nxv2i64( %a) { ; CHECK-LABEL: reverse_nxv2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1515,10 +1485,10 @@ define @reverse_nxv4i64( %a) { ; CHECK-LABEL: reverse_nxv4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1533,21 +1503,21 @@ define @reverse_nxv4i64( %a) { define @reverse_nxv8i64( %a) { ; CHECK-LABEL: reverse_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8i64( %a) ret %res @@ -1609,10 +1579,10 @@ define @reverse_nxv8bf16( %a) { ; CHECK-LABEL: reverse_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1626,10 +1596,10 @@ define @reverse_nxv16bf16( %a) { ; CHECK-LABEL: reverse_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1644,21 +1614,21 @@ define @reverse_nxv16bf16( %a) { define @reverse_nxv32bf16( %a) { ; CHECK-LABEL: reverse_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr 
a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv32bf16( %a) ret %res @@ -1716,10 +1686,10 @@ define @reverse_nxv8f16( %a) { ; CHECK-LABEL: reverse_nxv8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1733,10 +1703,10 @@ define @reverse_nxv16f16( %a) { ; CHECK-LABEL: reverse_nxv16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1751,21 +1721,21 @@ define @reverse_nxv16f16( %a) { define @reverse_nxv32f16( %a) { ; CHECK-LABEL: reverse_nxv32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv32f16( %a) ret %res @@ -1807,10 +1777,10 @@ define @reverse_nxv4f32( %a) { ; CHECK-LABEL: reverse_nxv4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, 
v9, v12 @@ -1824,10 +1794,10 @@ define @reverse_nxv8f32( %a) { ; CHECK-LABEL: reverse_nxv8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1842,21 +1812,21 @@ define @reverse_nxv8f32( %a) { define @reverse_nxv16f32( %a) { ; CHECK-LABEL: reverse_nxv16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv16f32( %a) ret %res @@ -1882,10 +1852,10 @@ define @reverse_nxv2f64( %a) { ; CHECK-LABEL: reverse_nxv2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v12, v10, a0 ; CHECK-NEXT: vrgather.vv v11, v8, v12 ; CHECK-NEXT: vrgather.vv v10, v9, v12 @@ -1899,10 +1869,10 @@ define @reverse_nxv4f64( %a) { ; CHECK-LABEL: reverse_nxv4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vrsub.vx v16, v12, a0 ; CHECK-NEXT: vrgather.vv v15, v8, v16 ; CHECK-NEXT: vrgather.vv v14, v9, v16 @@ -1917,21 +1887,21 @@ define @reverse_nxv4f64( %a) { define @reverse_nxv8f64( %a) { ; CHECK-LABEL: reverse_nxv8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vrsub.vx v24, v8, a0 -; CHECK-NEXT: vrgather.vv v15, v16, v24 -; CHECK-NEXT: vrgather.vv v14, v17, v24 -; CHECK-NEXT: vrgather.vv v13, v18, v24 -; CHECK-NEXT: vrgather.vv v12, v19, v24 -; CHECK-NEXT: vrgather.vv v11, v20, v24 -; CHECK-NEXT: vrgather.vv v10, v21, v24 -; CHECK-NEXT: vrgather.vv v9, v22, v24 -; CHECK-NEXT: vrgather.vv v8, v23, v24 +; CHECK-NEXT: vrsub.vx v24, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v24 +; CHECK-NEXT: vrgather.vv v22, v9, v24 +; CHECK-NEXT: vrgather.vv v21, v10, v24 +; CHECK-NEXT: vrgather.vv v20, v11, v24 +; CHECK-NEXT: 
vrgather.vv v19, v12, v24 +; CHECK-NEXT: vrgather.vv v18, v13, v24 +; CHECK-NEXT: vrgather.vv v17, v14, v24 +; CHECK-NEXT: vrgather.vv v16, v15, v24 +; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv8f64( %a) ret %res @@ -1943,19 +1913,18 @@ define @reverse_nxv3i64( %a) { ; CHECK-LABEL: reverse_nxv3i64: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma ; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vrsub.vx v14, v12, a0 -; CHECK-NEXT: vrgather.vv v13, v10, v14 -; CHECK-NEXT: vrgather.vv v10, v9, v14 -; CHECK-NEXT: vmv.v.v v12, v13 -; CHECK-NEXT: vrgather.vv v15, v8, v14 -; CHECK-NEXT: vmv.v.v v13, v10 -; CHECK-NEXT: vrgather.vv v8, v11, v14 -; CHECK-NEXT: vmv.v.v v14, v15 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vrsub.vx v12, v12, a0 +; CHECK-NEXT: vrgather.vv v15, v8, v12 +; CHECK-NEXT: vrgather.vv v14, v9, v12 +; CHECK-NEXT: vrgather.vv v13, v10, v12 +; CHECK-NEXT: vrgather.vv v8, v11, v12 +; CHECK-NEXT: vmv.v.v v9, v14 +; CHECK-NEXT: vmv.v.v v10, v15 +; CHECK-NEXT: vmv.v.v v8, v13 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv3i64( %a) ret %res @@ -1969,19 +1938,18 @@ define @reverse_nxv6i64( %a) { ; CHECK-NEXT: vid.v v16 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vrsub.vx v24, v16, a0 -; CHECK-NEXT: vrgather.vv v21, v10, v24 -; CHECK-NEXT: vrgather.vv v19, v12, v24 -; CHECK-NEXT: vrgather.vv v18, v13, v24 -; CHECK-NEXT: vrgather.vv v20, v11, v24 -; CHECK-NEXT: vmv2r.v v16, v18 -; CHECK-NEXT: vmv2r.v v18, v20 -; CHECK-NEXT: vrgather.vv v23, v8, v24 -; CHECK-NEXT: vrgather.vv v22, v9, v24 -; CHECK-NEXT: vrgather.vv v9, v14, v24 -; CHECK-NEXT: vrgather.vv v8, v15, v24 -; CHECK-NEXT: vmv2r.v v20, v22 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vrsub.vx v16, v16, a0 +; CHECK-NEXT: vrgather.vv v23, v8, v16 +; CHECK-NEXT: vrgather.vv v22, v9, v16 +; CHECK-NEXT: vrgather.vv v21, v10, v16 +; CHECK-NEXT: vrgather.vv v20, v11, v16 +; CHECK-NEXT: vrgather.vv v9, v14, v16 +; CHECK-NEXT: vrgather.vv v19, v12, v16 +; CHECK-NEXT: vrgather.vv v18, v13, v16 +; CHECK-NEXT: vrgather.vv v8, v15, v16 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: vmv2r.v v12, v22 +; CHECK-NEXT: vmv2r.v v8, v18 ; CHECK-NEXT: ret %res = call @llvm.vector.reverse.nxv6i64( %a) ret %res @@ -2007,23 +1975,23 @@ define @reverse_nxv12i64( %a) { ; RV32-NEXT: vid.v v20 ; RV32-NEXT: srli a1, a0, 3 ; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: vrsub.vx v7, v20, a1 -; RV32-NEXT: vrgather.vv v31, v12, v7 -; RV32-NEXT: vrgather.vv v23, v8, v7 -; RV32-NEXT: vrgather.vv v30, v13, v7 -; RV32-NEXT: vrgather.vv v22, v9, v7 -; RV32-NEXT: vrgather.vv v29, v14, v7 -; RV32-NEXT: vrgather.vv v21, v10, v7 -; RV32-NEXT: vrgather.vv v28, v15, v7 -; RV32-NEXT: vrgather.vv v20, v11, v7 +; RV32-NEXT: vrsub.vx v20, v20, a1 ; RV32-NEXT: addi a1, sp, 64 ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: vrgather.vv v27, v16, v7 -; RV32-NEXT: vs4r.v v20, (a0) -; RV32-NEXT: vrgather.vv v26, v17, v7 -; RV32-NEXT: vrgather.vv v25, v18, v7 -; RV32-NEXT: vrgather.vv v24, v19, v7 +; RV32-NEXT: vrgather.vv v31, v12, v20 +; RV32-NEXT: vrgather.vv v30, v13, v20 +; RV32-NEXT: vrgather.vv v29, v14, v20 +; RV32-NEXT: vrgather.vv v28, v15, v20 +; RV32-NEXT: vrgather.vv v27, v16, v20 +; RV32-NEXT: vrgather.vv v26, v17, v20 +; RV32-NEXT: vrgather.vv v25, v18, v20 +; RV32-NEXT: vrgather.vv v24, v19, v20 +; RV32-NEXT: 
vrgather.vv v15, v8, v20 +; RV32-NEXT: vrgather.vv v14, v9, v20 +; RV32-NEXT: vrgather.vv v13, v10, v20 +; RV32-NEXT: vrgather.vv v12, v11, v20 +; RV32-NEXT: vs4r.v v12, (a0) ; RV32-NEXT: vs8r.v v24, (a1) ; RV32-NEXT: vl8re64.v v16, (a0) ; RV32-NEXT: vl8re64.v v8, (a1) @@ -2056,23 +2024,23 @@ define @reverse_nxv12i64( %a) { ; RV64-NEXT: vid.v v20 ; RV64-NEXT: srli a1, a0, 3 ; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: vrsub.vx v7, v20, a1 -; RV64-NEXT: vrgather.vv v31, v12, v7 -; RV64-NEXT: vrgather.vv v23, v8, v7 -; RV64-NEXT: vrgather.vv v30, v13, v7 -; RV64-NEXT: vrgather.vv v22, v9, v7 -; RV64-NEXT: vrgather.vv v29, v14, v7 -; RV64-NEXT: vrgather.vv v21, v10, v7 -; RV64-NEXT: vrgather.vv v28, v15, v7 -; RV64-NEXT: vrgather.vv v20, v11, v7 +; RV64-NEXT: vrsub.vx v20, v20, a1 ; RV64-NEXT: addi a1, sp, 64 ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: vrgather.vv v27, v16, v7 -; RV64-NEXT: vs4r.v v20, (a0) -; RV64-NEXT: vrgather.vv v26, v17, v7 -; RV64-NEXT: vrgather.vv v25, v18, v7 -; RV64-NEXT: vrgather.vv v24, v19, v7 +; RV64-NEXT: vrgather.vv v31, v12, v20 +; RV64-NEXT: vrgather.vv v30, v13, v20 +; RV64-NEXT: vrgather.vv v29, v14, v20 +; RV64-NEXT: vrgather.vv v28, v15, v20 +; RV64-NEXT: vrgather.vv v27, v16, v20 +; RV64-NEXT: vrgather.vv v26, v17, v20 +; RV64-NEXT: vrgather.vv v25, v18, v20 +; RV64-NEXT: vrgather.vv v24, v19, v20 +; RV64-NEXT: vrgather.vv v15, v8, v20 +; RV64-NEXT: vrgather.vv v14, v9, v20 +; RV64-NEXT: vrgather.vv v13, v10, v20 +; RV64-NEXT: vrgather.vv v12, v11, v20 +; RV64-NEXT: vs4r.v v12, (a0) ; RV64-NEXT: vs8r.v v24, (a1) ; RV64-NEXT: vl8re64.v v16, (a0) ; RV64-NEXT: vl8re64.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll index 26325328e5671..904ea6a6090a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll @@ -12,53 +12,53 @@ define i32 @splat_vector_split_i64() { ; CHECK-NEXT: vmv.v.i v10, 3 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: lui a1, 1044480 -; CHECK-NEXT: li a2, 56 -; CHECK-NEXT: li a3, 40 -; CHECK-NEXT: lui a4, 16 -; CHECK-NEXT: lui a0, 4080 +; CHECK-NEXT: lui a0, 1044480 +; CHECK-NEXT: li a1, 56 +; CHECK-NEXT: li a2, 40 +; CHECK-NEXT: lui a3, 16 +; CHECK-NEXT: lui a4, 4080 ; CHECK-NEXT: addi a5, sp, 8 -; CHECK-NEXT: sw a1, 8(sp) -; CHECK-NEXT: sw zero, 12(sp) -; CHECK-NEXT: addi a1, a4, -256 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, ma ; CHECK-NEXT: vslideup.vi v8, v10, 3 -; CHECK-NEXT: vsetvli a4, zero, e64, m2, ta, ma -; CHECK-NEXT: vlse64.v v10, (a5), zero -; CHECK-NEXT: vsrl.vx v12, v8, a2 -; CHECK-NEXT: vsrl.vx v14, v8, a3 +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: sw zero, 12(sp) +; CHECK-NEXT: addi a0, a3, -256 +; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; CHECK-NEXT: vsrl.vx v12, v8, a1 +; CHECK-NEXT: vsrl.vx v14, v8, a2 ; CHECK-NEXT: vsrl.vi v16, v8, 24 -; CHECK-NEXT: vsll.vx v18, v8, a2 -; CHECK-NEXT: vand.vx v14, v14, a1 +; CHECK-NEXT: vlse64.v v10, (a5), zero +; CHECK-NEXT: vsll.vx v18, v8, a1 +; CHECK-NEXT: vand.vx v14, v14, a0 ; CHECK-NEXT: vor.vv v14, v14, v12 -; CHECK-NEXT: vand.vx v12, v8, a1 -; CHECK-NEXT: vsll.vx v12, v12, a3 +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsll.vx v12, v12, a2 ; CHECK-NEXT: vor.vv v12, v18, v12 ; CHECK-NEXT: vsrl.vi v18, v8, 8 -; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vand.vx v16, v16, a4 ; CHECK-NEXT: vand.vv 
v18, v18, v10 ; CHECK-NEXT: vor.vv v16, v18, v16 -; CHECK-NEXT: lui a1, 61681 -; CHECK-NEXT: lui a2, 209715 -; CHECK-NEXT: lui a3, 349525 -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: addi a2, a2, 819 -; CHECK-NEXT: addi a3, a3, 1365 -; CHECK-NEXT: vor.vv v14, v16, v14 -; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v16, a1 -; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: lui a0, 61681 +; CHECK-NEXT: lui a1, 209715 +; CHECK-NEXT: lui a2, 349525 +; CHECK-NEXT: addi a0, a0, -241 +; CHECK-NEXT: addi a1, a1, 819 +; CHECK-NEXT: addi a2, a2, 1365 ; CHECK-NEXT: vand.vv v10, v8, v10 -; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vand.vx v8, v8, a4 +; CHECK-NEXT: vor.vv v14, v16, v14 +; CHECK-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vsll.vi v8, v8, 24 ; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v12, a3 +; CHECK-NEXT: vmv.v.x v12, a2 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vor.vv v8, v8, v14 ; CHECK-NEXT: vsrl.vi v14, v8, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 5c04a09c9953b..8ca4fd24756d9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -189,10 +189,10 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x ; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v12, a0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-NEXT: vlseg3e32.v v8, (a0) @@ -252,13 +252,13 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle ; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v13, v12, a0 ; CHECK-NEXT: vslideup.vx v8, v14, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v13, a1 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vlseg5e16.v v8, (a0) @@ -294,8 +294,8 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v15, a1 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vlseg6e16.v v8, (a0) @@ -537,10 +537,10 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32 ; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx 
v8, v12, a0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-NEXT: vlseg3e32.v v8, (a0) @@ -608,13 +608,13 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein ; CHECK-NEXT: vslidedown.vi v10, v8, 8 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v13, v12, a0 ; CHECK-NEXT: vslideup.vx v8, v14, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v13, a1 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vlseg5e16.v v8, (a0) @@ -654,8 +654,8 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v15, a1 -; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vlseg6e16.v v8, (a0) @@ -681,26 +681,23 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vslidedown.vi v11, v8, 2 ; CHECK-NEXT: vslidedown.vi v12, v8, 1 -; CHECK-NEXT: vmv1r.v v10, v8 ; CHECK-NEXT: vslidedown.vi v13, v8, 5 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vslidedown.vi v14, v8, 6 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v9, a0 -; CHECK-NEXT: vslideup.vx v10, v12, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v11, a1 -; CHECK-NEXT: vslidedown.vi v11, v8, 4 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v13, a0 +; CHECK-NEXT: vslideup.vx v11, v10, a0 +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v9, v13, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v11, v14, a1 +; CHECK-NEXT: vslideup.vx v8, v11, a1 +; CHECK-NEXT: vslideup.vx v9, v14, a1 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs2r.v v10, (a0) +; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb @@ -728,22 +725,20 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, ; CHECK-NEXT: vslidedown.vi v10, v8, 7 ; CHECK-NEXT: vslidedown.vi v11, v8, 6 ; CHECK-NEXT: vslidedown.vi v12, v8, 5 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslidedown.vi v13, v8, 3 +; CHECK-NEXT: vslidedown.vi v14, v8, 2 +; CHECK-NEXT: vslidedown.vi v15, v8, 1 ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v11, v10, a0 ; CHECK-NEXT: vslideup.vx v9, v12, a0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v11, a1 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vslidedown.vi v11, v8, 2 -; CHECK-NEXT: vslidedown.vi v12, 
v8, 1 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v10, a0 -; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vslideup.vx v14, v13, a0 +; CHECK-NEXT: vslideup.vx v8, v15, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v11, a1 +; CHECK-NEXT: vslideup.vx v9, v11, a1 +; CHECK-NEXT: vslideup.vx v8, v14, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index bd1a3f8e316bb..90ea6ccc52bab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -593,29 +593,30 @@ define {, , , , } @vector_deinterleave_nxv64i1_nxv ; V-NEXT: vmv.v.i v24, 0 ; V-NEXT: vmerge.vim v16, v24, 1, v0 ; V-NEXT: vmv1r.v v0, v8 -; V-NEXT: vmerge.vim v24, v24, 1, v0 +; V-NEXT: vmerge.vim v8, v24, 1, v0 ; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; V-NEXT: vnsrl.wi v8, v16, 0 +; V-NEXT: vnsrl.wi v24, v16, 0 +; V-NEXT: vnsrl.wi v28, v8, 0 ; V-NEXT: vnsrl.wi v0, v16, 8 -; V-NEXT: vnsrl.wi v12, v24, 0 -; V-NEXT: vnsrl.wi v4, v24, 8 +; V-NEXT: vnsrl.wi v4, v8, 8 ; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; V-NEXT: vmsne.vi v16, v8, 0 +; V-NEXT: vmsne.vi v9, v24, 0 ; V-NEXT: vmsne.vi v8, v0, 0 -; V-NEXT: vmv1r.v v0, v16 +; V-NEXT: vmv1r.v v0, v9 ; V-NEXT: ret ; ; ZIP-LABEL: vector_deinterleave_nxv64i1_nxv128i1: @@ -195,8 +195,8 @@ define {, } @vector_deinterleave_nxv64i1_nxv ; ZIP-NEXT: vmerge.vim v24, v24, 1, v0 ; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v12, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v8, v24, v28 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v24, v28 ; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; ZIP-NEXT: vmsne.vi v16, v8, 0 @@ -213,8 +213,8 @@ define {, } @vector_deinterleave_nxv64i8_nxv ; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; V-NEXT: vmv8r.v v24, v8 ; V-NEXT: vnsrl.wi v8, v24, 0 -; V-NEXT: vnsrl.wi v0, v24, 8 ; V-NEXT: vnsrl.wi v12, v16, 0 +; V-NEXT: vnsrl.wi v0, v24, 8 ; V-NEXT: vnsrl.wi v4, v16, 8 ; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret @@ -223,8 +223,8 @@ define {, } @vector_deinterleave_nxv64i8_nxv ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -239,8 +239,8 @@ define {, } @vector_deinterleave_nxv32i16_ ; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; V-NEXT: vmv8r.v v24, v8 ; V-NEXT: vnsrl.wi v8, v24, 0 -; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v12, v16, 0 +; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v4, v16, 16 ; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret @@ -249,8 +249,8 @@ define {, } @vector_deinterleave_nxv32i16_ ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -262,22 +262,22 @@ ret {, } %retval define {, } @vector_deinterleave_nxv16i32_nxvv32i32( %vec) { ; V-LABEL: vector_deinterleave_nxv16i32_nxvv32i32: ; V: # %bb.0: -; 
V-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; V-NEXT: vmv8r.v v24, v16 ; V-NEXT: li a0, 32 -; V-NEXT: vnsrl.wx v20, v24, a0 -; V-NEXT: vnsrl.wx v16, v8, a0 -; V-NEXT: vnsrl.wi v0, v8, 0 -; V-NEXT: vnsrl.wi v4, v24, 0 -; V-NEXT: vmv8r.v v8, v0 +; V-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; V-NEXT: vnsrl.wi v24, v8, 0 +; V-NEXT: vnsrl.wx v4, v16, a0 +; V-NEXT: vnsrl.wx v0, v8, a0 +; V-NEXT: vnsrl.wi v28, v16, 0 +; V-NEXT: vmv8r.v v8, v24 +; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret ; ; ZIP-LABEL: vector_deinterleave_nxv16i32_nxvv32i32: ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -337,8 +337,8 @@ define {, } @vector_deinterleave_nxv8i64_nxv ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -601,54 +601,54 @@ define {, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , } @vector_deinterleave_nxv ; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; V-NEXT: vmv8r.v v24, v8 ; V-NEXT: vnsrl.wi v8, v24, 0 -; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v12, v16, 0 +; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v4, v16, 16 ; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret @@ -1794,8 +1802,8 @@ define {, } @vector_deinterleave_nxv ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -1810,8 +1818,8 @@ define {, } @vector_deinterleave_nxv32f1 ; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; V-NEXT: vmv8r.v v24, v8 ; V-NEXT: vnsrl.wi v8, v24, 0 -; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v12, v16, 0 +; V-NEXT: vnsrl.wi v0, v24, 16 ; V-NEXT: vnsrl.wi v4, v16, 16 ; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret @@ -1820,8 +1828,8 @@ define {, } @vector_deinterleave_nxv32f1 ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -1833,22 +1841,22 @@ ret {, } %retval define {, } @vector_deinterleave_nxv16f32_nxv32f32( %vec) { ; V-LABEL: vector_deinterleave_nxv16f32_nxv32f32: ; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; V-NEXT: vmv8r.v v24, v16 ; V-NEXT: li a0, 32 -; V-NEXT: vnsrl.wx v20, v24, a0 -; V-NEXT: vnsrl.wx v16, v8, a0 -; V-NEXT: vnsrl.wi v0, v8, 0 -; V-NEXT: vnsrl.wi v4, v24, 0 -; V-NEXT: vmv8r.v v8, v0 +; V-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; V-NEXT: vnsrl.wi v24, v8, 0 +; V-NEXT: vnsrl.wx v4, v16, a0 +; V-NEXT: vnsrl.wx v0, v8, a0 +; V-NEXT: vnsrl.wi v28, v16, 0 +; V-NEXT: vmv8r.v v8, v24 +; V-NEXT: vmv8r.v v16, v0 ; V-NEXT: ret ; ; ZIP-LABEL: vector_deinterleave_nxv16f32_nxv32f32: ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; ZIP-NEXT: 
ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -1908,8 +1916,8 @@ define {, } @vector_deinterleave_nxv8f ; ZIP: # %bb.0: ; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20 -; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12 +; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20 ; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12 ; ZIP-NEXT: vmv8r.v v8, v24 ; ZIP-NEXT: vmv8r.v v16, v0 @@ -2384,9 +2392,9 @@ define {, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: lui a0, 12304 -; CHECK-NEXT: addi a0, a0, 512 ; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, a0, 512 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v10 @@ -146,8 +146,8 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; ZVBB-NEXT: vmv1r.v v10, v9 ; ZVBB-NEXT: lui a0, 12304 -; ZVBB-NEXT: addi a0, a0, 512 ; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: addi a0, a0, 512 ; ZVBB-NEXT: vmv.s.x v10, a0 ; ZVBB-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVBB-NEXT: vsext.vf2 v12, v10 @@ -176,12 +176,12 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vle32.v v9, (a2) ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a2) ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vle32.v v10, (a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -202,12 +202,12 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vle32.v v9, (a2) ; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vle32.v v9, (a2) ; ZVBB-NEXT: add a1, a2, a1 ; ZVBB-NEXT: vle32.v v10, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -228,12 +228,12 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: srli a1, a1, 1 ; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; ZIP-NEXT: vsseg3e32.v v8, (a0) +; ZIP-NEXT: srli a1, a1, 1 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vle32.v v9, (a2) ; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: vle32.v v9, (a2) ; ZIP-NEXT: add a1, a2, a1 ; ZIP-NEXT: vle32.v v10, (a1) ; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -258,19 +258,19 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: srli 
a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 ; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v11, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: csrr a0, vlenb @@ -287,19 +287,19 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vsseg4e32.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 ; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle32.v v10, (a0) ; ZVBB-NEXT: vle32.v v11, (a1) -; ZVBB-NEXT: vle32.v v8, (a0) ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v10, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v10, v11, 2 ; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 4 ; ZVBB-NEXT: csrr a0, vlenb @@ -316,19 +316,19 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZIP-NEXT: vsseg4e32.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 1 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZIP-NEXT: vsseg4e32.v v8, (a0) -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a1, a3, a1 -; ZIP-NEXT: vle32.v v10, (a3) +; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: add a0, a2, a1 ; ZIP-NEXT: vle32.v v9, (a2) +; ZIP-NEXT: add a1, a0, a1 +; ZIP-NEXT: vle32.v v10, (a0) ; ZIP-NEXT: vle32.v v11, (a1) -; ZIP-NEXT: vle32.v v8, (a0) ; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZIP-NEXT: vslideup.vi v10, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v10, v11, 2 ; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 4 ; ZIP-NEXT: csrr a0, vlenb @@ -349,23 +349,23 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg5e16.v v8, (a0) -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: vle16.v v9, (a2) -; CHECK-NEXT: vle16.v v11, (a4) -; CHECK-NEXT: vle16.v v12, (a3) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vle16.v v12, (a2) +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vle16.v v10, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, 
mf2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v11, v12, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vslideup.vi v8, v11, 4 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: csrr a0, vlenb @@ -382,23 +382,23 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg5e16.v v8, (a0) -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: vle16.v v9, (a2) -; ZVBB-NEXT: vle16.v v11, (a4) -; ZVBB-NEXT: vle16.v v12, (a3) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: add a1, a4, a1 +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle16.v v11, (a0) +; ZVBB-NEXT: vle16.v v12, (a2) +; ZVBB-NEXT: add a1, a2, a1 ; ZVBB-NEXT: vle16.v v10, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v11, v12, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vslideup.vi v8, v11, 4 ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 8 ; ZVBB-NEXT: csrr a0, vlenb @@ -415,23 +415,23 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg5e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg5e16.v v8, (a0) -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: vle16.v v9, (a2) -; ZIP-NEXT: vle16.v v11, (a4) -; ZIP-NEXT: vle16.v v12, (a3) ; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: add a1, a4, a1 +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle16.v v11, (a0) +; ZIP-NEXT: vle16.v v12, (a2) +; ZIP-NEXT: add a1, a2, a1 ; ZIP-NEXT: vle16.v v10, (a1) ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v12, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v11, v12, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vslideup.vi v8, v11, 4 ; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 8 ; ZIP-NEXT: csrr a0, vlenb @@ -452,26 +452,27 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a2) -; CHECK-NEXT: add a2, a3, a1 -; CHECK-NEXT: vle16.v v11, (a2) ; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: vle16.v v12, (a3) -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 
+; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vle16.v v13, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vslideup.vi v10, v13, 2 +; CHECK-NEXT: vslideup.vi v11, v12, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vslideup.vi v8, v11, 4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v13, 2 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: csrr a0, vlenb @@ -488,26 +489,27 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg6e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v8, (a0) ; ZVBB-NEXT: vle16.v v9, (a2) -; ZVBB-NEXT: add a2, a3, a1 -; ZVBB-NEXT: vle16.v v11, (a2) ; ZVBB-NEXT: add a2, a2, a1 -; ZVBB-NEXT: vle16.v v12, (a3) -; ZVBB-NEXT: add a1, a2, a1 -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: vle16.v v12, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle16.v v10, (a0) ; ZVBB-NEXT: vle16.v v13, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 -; ZVBB-NEXT: vslideup.vi v10, v13, 2 +; ZVBB-NEXT: vslideup.vi v11, v12, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vslideup.vi v8, v11, 4 +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v13, 2 ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 8 ; ZVBB-NEXT: csrr a0, vlenb @@ -524,26 +526,27 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg6e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg6e16.v v8, (a0) +; ZIP-NEXT: vle16.v v8, (a0) ; ZIP-NEXT: vle16.v v9, (a2) -; ZIP-NEXT: add a2, a3, a1 -; ZIP-NEXT: vle16.v v11, (a2) ; ZIP-NEXT: add a2, a2, a1 -; ZIP-NEXT: vle16.v v12, (a3) -; ZIP-NEXT: add a1, a2, a1 -; ZIP-NEXT: vle16.v v10, (a2) -; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: vle16.v v12, (a0) +; ZIP-NEXT: add a0, a0, a1 +; ZIP-NEXT: add a1, a0, a1 +; ZIP-NEXT: vle16.v v10, (a0) ; ZIP-NEXT: vle16.v v13, (a1) ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v12, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 -; ZIP-NEXT: vslideup.vi v10, v13, 2 +; ZIP-NEXT: vslideup.vi v11, v12, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vslideup.vi v8, v11, 4 +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v10, v13, 2 ; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 8 ; 
ZIP-NEXT: csrr a0, vlenb @@ -563,30 +566,32 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vsseg7e8.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 3 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: vsetvli a5, zero, e8, mf8, ta, ma -; CHECK-NEXT: vsseg7e8.v v8, (a0) -; CHECK-NEXT: vle8.v v9, (a4) -; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vle8.v v10, (a2) -; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vle8.v v11, (a2) -; CHECK-NEXT: vle8.v v12, (a4) ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vle8.v v13, (a1) -; CHECK-NEXT: vle8.v v14, (a3) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vle8.v v11, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle8.v v12, (a2) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle8.v v13, (a0) +; CHECK-NEXT: vle8.v v14, (a1) ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v13, 4 -; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v13, 2 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v14, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 6 +; CHECK-NEXT: vslideup.vi v8, v11, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: csrr a0, vlenb @@ -601,30 +606,32 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; ZVBB-NEXT: vsseg7e8.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 3 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: vsetvli a5, zero, e8, mf8, ta, ma -; ZVBB-NEXT: vsseg7e8.v v8, (a0) -; ZVBB-NEXT: vle8.v v9, (a4) -; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vle8.v v10, (a2) -; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: add a1, a2, a1 -; ZVBB-NEXT: vle8.v v11, (a2) -; ZVBB-NEXT: vle8.v v12, (a4) ; ZVBB-NEXT: vle8.v v8, (a0) -; ZVBB-NEXT: vle8.v v13, (a1) -; ZVBB-NEXT: vle8.v v14, (a3) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle8.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle8.v v10, (a0) +; ZVBB-NEXT: vle8.v v11, (a2) +; ZVBB-NEXT: add a2, a2, a1 +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle8.v v12, (a2) +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle8.v v13, (a0) +; ZVBB-NEXT: vle8.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 -; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 2 ; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v13, 4 -; ZVBB-NEXT: vslideup.vi v8, v14, 4 +; ZVBB-NEXT: vslideup.vi v12, v14, 4 ; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v8, 
v9, 6 +; ZVBB-NEXT: vslideup.vi v8, v11, 6 ; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v12, 8 ; ZVBB-NEXT: csrr a0, vlenb @@ -639,30 +646,32 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; ZIP-NEXT: vsseg7e8.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 3 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: vsetvli a5, zero, e8, mf8, ta, ma -; ZIP-NEXT: vsseg7e8.v v8, (a0) -; ZIP-NEXT: vle8.v v9, (a4) -; ZIP-NEXT: add a4, a4, a1 -; ZIP-NEXT: vle8.v v10, (a2) -; ZIP-NEXT: add a2, a4, a1 -; ZIP-NEXT: add a1, a2, a1 -; ZIP-NEXT: vle8.v v11, (a2) -; ZIP-NEXT: vle8.v v12, (a4) ; ZIP-NEXT: vle8.v v8, (a0) -; ZIP-NEXT: vle8.v v13, (a1) -; ZIP-NEXT: vle8.v v14, (a3) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle8.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle8.v v10, (a0) +; ZIP-NEXT: vle8.v v11, (a2) +; ZIP-NEXT: add a2, a2, a1 +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle8.v v12, (a2) +; ZIP-NEXT: add a1, a0, a1 +; ZIP-NEXT: vle8.v v13, (a0) +; ZIP-NEXT: vle8.v v14, (a1) ; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v12, v11, 2 -; ZIP-NEXT: vslideup.vi v8, v10, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 ; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v12, v13, 4 -; ZIP-NEXT: vslideup.vi v8, v14, 4 +; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v13, 2 +; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v14, 4 ; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v8, v9, 6 +; ZIP-NEXT: vslideup.vi v8, v11, 6 ; ZIP-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; ZIP-NEXT: vslideup.vi v8, v12, 8 ; ZIP-NEXT: csrr a0, vlenb @@ -681,35 +690,38 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vsseg8e8.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 3 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: add a5, a4, a1 -; CHECK-NEXT: add a6, a5, a1 -; CHECK-NEXT: vsetvli a7, zero, e8, mf8, ta, ma -; CHECK-NEXT: vsseg8e8.v v8, (a0) -; CHECK-NEXT: vle8.v v9, (a6) -; CHECK-NEXT: add a6, a6, a1 -; CHECK-NEXT: vle8.v v10, (a5) -; CHECK-NEXT: vle8.v v11, (a6) -; CHECK-NEXT: add a1, a6, a1 -; CHECK-NEXT: vle8.v v12, (a2) ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vle8.v v13, (a3) -; CHECK-NEXT: vle8.v v14, (a4) -; CHECK-NEXT: vle8.v v15, (a1) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle8.v v11, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v13, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v9, 2 -; CHECK-NEXT: vslideup.vi v8, v12, 2 +; CHECK-NEXT: vslideup.vi v12, 
v13, 2 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v11, 4 -; CHECK-NEXT: vslideup.vi v8, v13, 4 +; CHECK-NEXT: vslideup.vi v12, v9, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v15, 6 -; CHECK-NEXT: vslideup.vi v8, v14, 6 +; CHECK-NEXT: vslideup.vi v12, v14, 6 +; CHECK-NEXT: vslideup.vi v8, v11, 6 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 @@ -722,35 +734,38 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; ZVBB-NEXT: vsseg8e8.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 3 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: add a5, a4, a1 -; ZVBB-NEXT: add a6, a5, a1 -; ZVBB-NEXT: vsetvli a7, zero, e8, mf8, ta, ma -; ZVBB-NEXT: vsseg8e8.v v8, (a0) -; ZVBB-NEXT: vle8.v v9, (a6) -; ZVBB-NEXT: add a6, a6, a1 -; ZVBB-NEXT: vle8.v v10, (a5) -; ZVBB-NEXT: vle8.v v11, (a6) -; ZVBB-NEXT: add a1, a6, a1 -; ZVBB-NEXT: vle8.v v12, (a2) ; ZVBB-NEXT: vle8.v v8, (a0) -; ZVBB-NEXT: vle8.v v13, (a3) -; ZVBB-NEXT: vle8.v v14, (a4) -; ZVBB-NEXT: vle8.v v15, (a1) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle8.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle8.v v10, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle8.v v11, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle8.v v12, (a0) +; ZVBB-NEXT: vle8.v v13, (a2) +; ZVBB-NEXT: add a2, a2, a1 ; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v10, v9, 2 -; ZVBB-NEXT: vslideup.vi v8, v12, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; ZVBB-NEXT: vle8.v v9, (a2) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle8.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v10, v11, 4 -; ZVBB-NEXT: vslideup.vi v8, v13, 4 +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 2 +; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v9, 4 ; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v10, v15, 6 -; ZVBB-NEXT: vslideup.vi v8, v14, 6 +; ZVBB-NEXT: vslideup.vi v12, v14, 6 +; ZVBB-NEXT: vslideup.vi v8, v11, 6 ; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v10, 8 +; ZVBB-NEXT: vslideup.vi v8, v12, 8 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 @@ -763,35 +778,38 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; ZIP-NEXT: vsseg8e8.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 3 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: add a5, a4, a1 -; ZIP-NEXT: add a6, a5, a1 -; ZIP-NEXT: vsetvli a7, zero, e8, mf8, ta, ma -; ZIP-NEXT: vsseg8e8.v v8, (a0) -; ZIP-NEXT: vle8.v v9, (a6) -; ZIP-NEXT: add a6, a6, a1 -; ZIP-NEXT: vle8.v v10, (a5) -; ZIP-NEXT: vle8.v v11, (a6) -; ZIP-NEXT: add a1, a6, a1 -; ZIP-NEXT: vle8.v v12, (a2) ; ZIP-NEXT: vle8.v v8, (a0) -; ZIP-NEXT: vle8.v v13, (a3) -; ZIP-NEXT: vle8.v v14, (a4) -; ZIP-NEXT: vle8.v v15, (a1) +; ZIP-NEXT: 
add a0, a2, a1 +; ZIP-NEXT: vle8.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle8.v v10, (a0) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle8.v v11, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle8.v v12, (a0) +; ZIP-NEXT: vle8.v v13, (a2) +; ZIP-NEXT: add a2, a2, a1 ; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v10, v9, 2 -; ZIP-NEXT: vslideup.vi v8, v12, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; ZIP-NEXT: vle8.v v9, (a2) +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: vle8.v v14, (a1) ; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v10, v11, 4 -; ZIP-NEXT: vslideup.vi v8, v13, 4 +; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v13, 2 +; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v9, 4 ; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v10, v15, 6 -; ZIP-NEXT: vslideup.vi v8, v14, 6 +; ZIP-NEXT: vslideup.vi v12, v14, 6 +; ZIP-NEXT: vslideup.vi v8, v11, 6 ; ZIP-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v10, 8 +; ZIP-NEXT: vslideup.vi v8, v12, 8 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: add sp, sp, a0 ; ZIP-NEXT: addi sp, sp, 16 @@ -956,8 +974,8 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: lui a0, 12304 -; CHECK-NEXT: addi a0, a0, 512 ; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, a0, 512 ; CHECK-NEXT: vmv.s.x v10, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vsext.vf2 v12, v10 @@ -971,8 +989,8 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; ZVBB-NEXT: vmv1r.v v10, v9 ; ZVBB-NEXT: lui a0, 12304 -; ZVBB-NEXT: addi a0, a0, 512 ; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: addi a0, a0, 512 ; ZVBB-NEXT: vmv.s.x v10, a0 ; ZVBB-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; ZVBB-NEXT: vsext.vf2 v12, v10 @@ -1001,12 +1019,12 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> % ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vle32.v v9, (a2) ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a2) ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vle32.v v10, (a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -1027,12 +1045,12 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> % ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vle32.v v9, (a2) ; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vle32.v v9, (a2) ; ZVBB-NEXT: add a1, a2, a1 ; ZVBB-NEXT: vle32.v v10, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -1053,12 +1071,12 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> % ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: srli a1, a1, 1 ; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; ZIP-NEXT: vsseg3e32.v v8, (a0) +; ZIP-NEXT: srli a1, 
a1, 1 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vle32.v v9, (a2) ; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: vle32.v v9, (a2) ; ZIP-NEXT: add a1, a2, a1 ; ZIP-NEXT: vle32.v v10, (a1) ; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -1083,19 +1101,19 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> % ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 ; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle32.v v10, (a0) ; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v11, 2 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: csrr a0, vlenb @@ -1112,19 +1130,19 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> % ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vsseg4e32.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 ; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle32.v v10, (a0) ; ZVBB-NEXT: vle32.v v11, (a1) -; ZVBB-NEXT: vle32.v v8, (a0) ; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v10, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v10, v11, 2 ; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 4 ; ZVBB-NEXT: csrr a0, vlenb @@ -1141,19 +1159,19 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> % ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; ZIP-NEXT: vsseg4e32.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 1 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZIP-NEXT: vsseg4e32.v v8, (a0) -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a1, a3, a1 -; ZIP-NEXT: vle32.v v10, (a3) +; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: add a0, a2, a1 ; ZIP-NEXT: vle32.v v9, (a2) +; ZIP-NEXT: add a1, a0, a1 +; ZIP-NEXT: vle32.v v10, (a0) ; ZIP-NEXT: vle32.v v11, (a1) -; ZIP-NEXT: vle32.v v8, (a0) ; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZIP-NEXT: vslideup.vi v10, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v10, v11, 2 ; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 4 ; ZIP-NEXT: csrr a0, vlenb @@ -1174,23 +1192,23 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; 
CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg5e16.v v8, (a0) -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: vle16.v v9, (a2) -; CHECK-NEXT: vle16.v v11, (a4) -; CHECK-NEXT: vle16.v v12, (a3) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: add a1, a4, a1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vle16.v v12, (a2) +; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: vle16.v v10, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v11, v12, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vslideup.vi v8, v11, 4 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: csrr a0, vlenb @@ -1207,23 +1225,23 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg5e16.v v8, (a0) -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: vle16.v v9, (a2) -; ZVBB-NEXT: vle16.v v11, (a4) -; ZVBB-NEXT: vle16.v v12, (a3) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: add a1, a4, a1 +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle16.v v11, (a0) +; ZVBB-NEXT: vle16.v v12, (a2) +; ZVBB-NEXT: add a1, a2, a1 ; ZVBB-NEXT: vle16.v v10, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v11, v12, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vslideup.vi v8, v11, 4 ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 8 ; ZVBB-NEXT: csrr a0, vlenb @@ -1240,23 +1258,23 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg5e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg5e16.v v8, (a0) -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: vle16.v v9, (a2) -; ZIP-NEXT: vle16.v v11, (a4) -; ZIP-NEXT: vle16.v v12, (a3) ; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: add a1, a4, a1 +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle16.v v11, (a0) +; ZIP-NEXT: vle16.v v12, (a2) +; ZIP-NEXT: add a1, a2, a1 ; ZIP-NEXT: vle16.v v10, (a1) ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v12, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v11, v12, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vslideup.vi v8, v11, 4 ; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 8 ; ZIP-NEXT: csrr a0, vlenb @@ -1277,26 +1295,27 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, 
mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a2) -; CHECK-NEXT: add a2, a3, a1 -; CHECK-NEXT: vle16.v v11, (a2) ; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: vle16.v v12, (a3) -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vle16.v v13, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v11, 2 ; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vslideup.vi v10, v13, 2 +; CHECK-NEXT: vslideup.vi v11, v12, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vslideup.vi v8, v11, 4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v13, 2 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vslideup.vi v8, v10, 8 ; CHECK-NEXT: csrr a0, vlenb @@ -1313,26 +1332,27 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg6e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v8, (a0) ; ZVBB-NEXT: vle16.v v9, (a2) -; ZVBB-NEXT: add a2, a3, a1 -; ZVBB-NEXT: vle16.v v11, (a2) ; ZVBB-NEXT: add a2, a2, a1 -; ZVBB-NEXT: vle16.v v12, (a3) -; ZVBB-NEXT: add a1, a2, a1 -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: vle16.v v12, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle16.v v10, (a0) ; ZVBB-NEXT: vle16.v v13, (a1) ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 2 ; ZVBB-NEXT: vslideup.vi v8, v9, 2 -; ZVBB-NEXT: vslideup.vi v10, v13, 2 +; ZVBB-NEXT: vslideup.vi v11, v12, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vslideup.vi v8, v11, 4 +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v13, 2 ; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v10, 8 ; ZVBB-NEXT: csrr a0, vlenb @@ -1349,26 +1369,27 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg6e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg6e16.v v8, (a0) +; ZIP-NEXT: vle16.v v8, (a0) ; ZIP-NEXT: vle16.v v9, (a2) -; ZIP-NEXT: add a2, a3, a1 -; ZIP-NEXT: vle16.v v11, (a2) ; ZIP-NEXT: add a2, a2, a1 -; ZIP-NEXT: vle16.v v12, (a3) -; ZIP-NEXT: add a1, a2, a1 -; ZIP-NEXT: vle16.v v10, (a2) -; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: vle16.v v12, (a0) +; ZIP-NEXT: add a0, a0, a1 +; ZIP-NEXT: add a1, a0, a1 +; 
ZIP-NEXT: vle16.v v10, (a0) ; ZIP-NEXT: vle16.v v13, (a1) ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v12, v11, 2 ; ZIP-NEXT: vslideup.vi v8, v9, 2 -; ZIP-NEXT: vslideup.vi v10, v13, 2 +; ZIP-NEXT: vslideup.vi v11, v12, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vslideup.vi v8, v11, 4 +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v10, v13, 2 ; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZIP-NEXT: vslideup.vi v8, v10, 8 ; ZIP-NEXT: csrr a0, vlenb @@ -1389,30 +1410,32 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: vsetvli a5, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg7e16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a4) -; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vle16.v v11, (a2) -; CHECK-NEXT: vle16.v v12, (a4) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v13, (a1) -; CHECK-NEXT: vle16.v v14, (a3) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v12, (a2) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vle16.v v13, (a0) +; CHECK-NEXT: vle16.v v14, (a1) ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v11, 1 -; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v12, v13, 2 -; CHECK-NEXT: vslideup.vi v8, v14, 2 +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v13, 1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v14, 2 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vslideup.vi v8, v11, 3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: csrr a0, vlenb @@ -1429,30 +1452,32 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg7e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: vsetvli a5, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg7e16.v v8, (a0) -; ZVBB-NEXT: vle16.v v9, (a4) -; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: add a1, a2, a1 -; ZVBB-NEXT: vle16.v v11, (a2) -; ZVBB-NEXT: vle16.v v12, (a4) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: vle16.v v13, (a1) -; ZVBB-NEXT: vle16.v v14, (a3) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle16.v v10, (a0) +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: add a2, a2, a1 +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v12, (a2) +; ZVBB-NEXT: add a1, a0, a1 +; ZVBB-NEXT: vle16.v v13, (a0) +; ZVBB-NEXT: 
vle16.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v11, 1 -; ZVBB-NEXT: vslideup.vi v8, v10, 1 +; ZVBB-NEXT: vslideup.vi v8, v9, 1 ; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v12, v13, 2 -; ZVBB-NEXT: vslideup.vi v8, v14, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 1 +; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v14, 2 ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v9, 3 +; ZVBB-NEXT: vslideup.vi v8, v11, 3 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVBB-NEXT: vslideup.vi v8, v12, 4 ; ZVBB-NEXT: csrr a0, vlenb @@ -1469,30 +1494,32 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg7e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: vsetvli a5, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg7e16.v v8, (a0) -; ZIP-NEXT: vle16.v v9, (a4) -; ZIP-NEXT: add a4, a4, a1 -; ZIP-NEXT: vle16.v v10, (a2) -; ZIP-NEXT: add a2, a4, a1 -; ZIP-NEXT: add a1, a2, a1 -; ZIP-NEXT: vle16.v v11, (a2) -; ZIP-NEXT: vle16.v v12, (a4) ; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: vle16.v v13, (a1) -; ZIP-NEXT: vle16.v v14, (a3) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle16.v v10, (a0) +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: add a2, a2, a1 +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v12, (a2) +; ZIP-NEXT: add a1, a0, a1 +; ZIP-NEXT: vle16.v v13, (a0) +; ZIP-NEXT: vle16.v v14, (a1) ; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v12, v11, 1 -; ZIP-NEXT: vslideup.vi v8, v10, 1 +; ZIP-NEXT: vslideup.vi v8, v9, 1 ; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v12, v13, 2 -; ZIP-NEXT: vslideup.vi v8, v14, 2 +; ZIP-NEXT: vslideup.vi v8, v10, 2 +; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v13, 1 +; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v14, 2 ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v8, v9, 3 +; ZIP-NEXT: vslideup.vi v8, v11, 3 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZIP-NEXT: vslideup.vi v8, v12, 4 ; ZIP-NEXT: csrr a0, vlenb @@ -1513,35 +1540,38 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) ; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a4, a3, a1 -; CHECK-NEXT: add a5, a4, a1 -; CHECK-NEXT: add a6, a5, a1 -; CHECK-NEXT: vsetvli a7, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg8e16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a6) -; CHECK-NEXT: add a6, a6, a1 -; CHECK-NEXT: vle16.v v10, (a5) -; CHECK-NEXT: vle16.v v11, (a6) -; CHECK-NEXT: add a1, a6, a1 -; CHECK-NEXT: vle16.v v12, (a2) ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v13, (a3) -; CHECK-NEXT: vle16.v v14, (a4) -; CHECK-NEXT: vle16.v v15, (a1) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: add a0, a2, a1 +; 
CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v13, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v9, 1 -; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vslideup.vi v12, v13, 1 ; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v13, 2 +; CHECK-NEXT: vslideup.vi v12, v9, 2 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v15, 3 -; CHECK-NEXT: vslideup.vi v8, v14, 3 +; CHECK-NEXT: vslideup.vi v12, v14, 3 +; CHECK-NEXT: vslideup.vi v8, v11, 3 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v12, 4 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 @@ -1556,35 +1586,38 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) ; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a4, a3, a1 -; ZVBB-NEXT: add a5, a4, a1 -; ZVBB-NEXT: add a6, a5, a1 -; ZVBB-NEXT: vsetvli a7, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg8e16.v v8, (a0) -; ZVBB-NEXT: vle16.v v9, (a6) -; ZVBB-NEXT: add a6, a6, a1 -; ZVBB-NEXT: vle16.v v10, (a5) -; ZVBB-NEXT: vle16.v v11, (a6) -; ZVBB-NEXT: add a1, a6, a1 -; ZVBB-NEXT: vle16.v v12, (a2) ; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: vle16.v v13, (a3) -; ZVBB-NEXT: vle16.v v14, (a4) -; ZVBB-NEXT: vle16.v v15, (a1) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle16.v v10, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vle16.v v12, (a0) +; ZVBB-NEXT: vle16.v v13, (a2) +; ZVBB-NEXT: add a2, a2, a1 ; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v10, v9, 1 -; ZVBB-NEXT: vslideup.vi v8, v12, 1 +; ZVBB-NEXT: vslideup.vi v8, v9, 1 +; ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle16.v v14, (a1) ; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; ZVBB-NEXT: vslideup.vi v10, v11, 2 -; ZVBB-NEXT: vslideup.vi v8, v13, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v13, 1 +; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v12, v9, 2 ; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v10, v15, 3 -; ZVBB-NEXT: vslideup.vi v8, v14, 3 +; ZVBB-NEXT: vslideup.vi v12, v14, 3 +; ZVBB-NEXT: vslideup.vi v8, v11, 3 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: vslideup.vi v8, v12, 4 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 @@ -1599,35 +1632,38 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: addi a0, sp, 
16 ; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg8e16.v v8, (a0) ; ZIP-NEXT: srli a1, a1, 2 ; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a4, a3, a1 -; ZIP-NEXT: add a5, a4, a1 -; ZIP-NEXT: add a6, a5, a1 -; ZIP-NEXT: vsetvli a7, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg8e16.v v8, (a0) -; ZIP-NEXT: vle16.v v9, (a6) -; ZIP-NEXT: add a6, a6, a1 -; ZIP-NEXT: vle16.v v10, (a5) -; ZIP-NEXT: vle16.v v11, (a6) -; ZIP-NEXT: add a1, a6, a1 -; ZIP-NEXT: vle16.v v12, (a2) ; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: vle16.v v13, (a3) -; ZIP-NEXT: vle16.v v14, (a4) -; ZIP-NEXT: vle16.v v15, (a1) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle16.v v10, (a0) +; ZIP-NEXT: add a0, a2, a1 +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vle16.v v12, (a0) +; ZIP-NEXT: vle16.v v13, (a2) +; ZIP-NEXT: add a2, a2, a1 ; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v10, v9, 1 -; ZIP-NEXT: vslideup.vi v8, v12, 1 +; ZIP-NEXT: vslideup.vi v8, v9, 1 +; ZIP-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: vle16.v v14, (a1) ; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma -; ZIP-NEXT: vslideup.vi v10, v11, 2 -; ZIP-NEXT: vslideup.vi v8, v13, 2 +; ZIP-NEXT: vslideup.vi v8, v10, 2 +; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v13, 1 +; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v12, v9, 2 ; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v10, v15, 3 -; ZIP-NEXT: vslideup.vi v8, v14, 3 +; ZIP-NEXT: vslideup.vi v12, v14, 3 +; ZIP-NEXT: vslideup.vi v8, v11, 3 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: vslideup.vi v8, v12, 4 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 2e2f12ac7f506..56dc8af1dd64f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -101,10 +101,10 @@ define void @vector_interleave_store_nxv16i64_nxv8i64( %a, @vector_interleave_nxv128i1_nxv64i1( @llvm.vector.interleave2.nxv128i1( %a, %b) @@ -303,8 +303,8 @@ define @vector_interleave_nxv128i8_nxv64i8( @vector_interleave_nxv64i16_nxv32i16( @vector_interleave_nxv32i32_nxv16i32( @vector_interleave_nxv16i64_nxv8i64( @vector_interleave_nxv16i64_nxv8i64( @vector_interleave_nxv16i64_nxv8i64( @vector_interleave_nxv48i1_nxv16i1( ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 -; CHECK-NEXT: slli a2, a1, 1 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 -; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: slli a2, a1, 1 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 -; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: add a3, a0, a2 ; CHECK-NEXT: vsseg3e8.v v14, (a0) +; CHECK-NEXT: add a2, a3, a2 ; CHECK-NEXT: vl2r.v v8, (a2) ; CHECK-NEXT: srli a2, a1, 1 ; CHECK-NEXT: vl2r.v v10, (a3) @@ -537,14 +537,14 @@ define @vector_interleave_nxv48i1_nxv16i1( ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 -; ZVBB-NEXT: slli a2, a1, 1 ; ZVBB-NEXT: vmv1r.v v0, v10 ; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 -; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: slli 
a2, a1, 1 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 -; ZVBB-NEXT: add a2, a3, a2 +; ZVBB-NEXT: add a3, a0, a2 ; ZVBB-NEXT: vsseg3e8.v v14, (a0) +; ZVBB-NEXT: add a2, a3, a2 ; ZVBB-NEXT: vl2r.v v8, (a2) ; ZVBB-NEXT: srli a2, a1, 1 ; ZVBB-NEXT: vl2r.v v10, (a3) @@ -578,9 +578,9 @@ define @vector_interleave_nxv48i8_nxv16i8( ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma ; CHECK-NEXT: vsseg3e8.v v8, (a0) +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vl2r.v v8, (a0) ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vl2r.v v10, (a0) @@ -602,9 +602,9 @@ define @vector_interleave_nxv48i8_nxv16i8( ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: vsetvli a2, zero, e8, m2, ta, ma ; ZVBB-NEXT: vsseg3e8.v v8, (a0) +; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: vl2r.v v8, (a0) ; ZVBB-NEXT: add a0, a0, a1 ; ZVBB-NEXT: vl2r.v v10, (a0) @@ -631,9 +631,9 @@ define @vector_interleave_nxv24i16_nxv8i16( @vector_interleave_nxv24i16_nxv8i16( @vector_interleave_nxv12i32_nxv4i32( @vector_interleave_nxv12i32_nxv4i32( @vector_interleave_nxv6i64_nxv2i64( ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-NEXT: vsseg3e64.v v8, (a0) +; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: vl2re64.v v8, (a0) ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vl2re64.v v10, (a0) @@ -761,9 +761,9 @@ define @vector_interleave_nxv6i64_nxv2i64( ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; ZVBB-NEXT: vsseg3e64.v v8, (a0) +; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: vl2re64.v v8, (a0) ; ZVBB-NEXT: add a0, a0, a1 ; ZVBB-NEXT: vl2re64.v v10, (a0) @@ -793,17 +793,17 @@ define @vector_interleave_nxv64i1_nxv16i1( ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 -; CHECK-NEXT: slli a2, a1, 1 ; CHECK-NEXT: vmv1r.v v0, v11 ; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 -; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: slli a2, a1, 1 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 -; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a3, a0, a2 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vim v20, v12, 1, v0 -; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: add a4, a3, a2 ; CHECK-NEXT: vsseg4e8.v v14, (a0) +; CHECK-NEXT: add a2, a4, a2 ; CHECK-NEXT: vl2r.v v8, (a2) ; CHECK-NEXT: srli a2, a1, 1 ; CHECK-NEXT: srli a1, a1, 2 @@ -838,17 +838,17 @@ define @vector_interleave_nxv64i1_nxv16i1( ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 -; ZVBB-NEXT: slli a2, a1, 1 ; ZVBB-NEXT: vmv1r.v v0, v11 ; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 -; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: slli a2, a1, 1 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 -; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a3, a0, a2 ; ZVBB-NEXT: vmv1r.v v0, v10 ; ZVBB-NEXT: vmerge.vim v20, v12, 1, v0 -; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: add a4, a3, a2 ; ZVBB-NEXT: vsseg4e8.v v14, (a0) +; ZVBB-NEXT: add a2, a4, a2 ; ZVBB-NEXT: vl2r.v v8, (a2) ; ZVBB-NEXT: srli a2, a1, 1 ; ZVBB-NEXT: srli a1, a1, 2 @@ -883,15 +883,15 @@ define @vector_interleave_nxv64i8_nxv16i8( ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: 
csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vsseg4e8.v v8, (a0) ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma -; CHECK-NEXT: vsseg4e8.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vl2r.v v12, (a3) -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl2r.v v14, (a1) ; CHECK-NEXT: vl2r.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vl2r.v v12, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2r.v v14, (a0) ; CHECK-NEXT: vl2r.v v10, (a2) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -907,15 +907,15 @@ define @vector_interleave_nxv64i8_nxv16i8( ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; ZVBB-NEXT: vsseg4e8.v v8, (a0) ; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vsetvli a3, zero, e8, m2, ta, ma -; ZVBB-NEXT: vsseg4e8.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vl2r.v v12, (a3) -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl2r.v v14, (a1) ; ZVBB-NEXT: vl2r.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vl2r.v v12, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2r.v v14, (a0) ; ZVBB-NEXT: vl2r.v v10, (a2) ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 3 @@ -935,14 +935,14 @@ define @vector_interleave_nxv32i8_nxv8i8( %a ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg4e8.v v8, (a0) ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg4e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v8, (a0) ; CHECK-NEXT: vl1r.v v10, (a3) ; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: vl1r.v v11, (a1) -; CHECK-NEXT: vl1r.v v8, (a0) ; CHECK-NEXT: vl1r.v v9, (a2) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 @@ -958,14 +958,14 @@ define @vector_interleave_nxv32i8_nxv8i8( %a ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg4e8.v v8, (a0) ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg4e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v8, (a0) ; ZVBB-NEXT: vl1r.v v10, (a3) ; ZVBB-NEXT: add a1, a3, a1 ; ZVBB-NEXT: vl1r.v v11, (a1) -; ZVBB-NEXT: vl1r.v v8, (a0) ; ZVBB-NEXT: vl1r.v v9, (a2) ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 2 @@ -986,15 +986,15 @@ define @vector_interleave_nxv16i32_nxv4i32( @vector_interleave_nxv16i32_nxv4i32( @vector_interleave_nxv8i64_nxv2i64( ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v8, (a0) ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma -; CHECK-NEXT: vsseg4e64.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vl2re64.v v12, (a3) -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl2re64.v v14, (a1) ; CHECK-NEXT: vl2re64.v v8, (a0) +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: vl2re64.v v12, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re64.v v14, (a0) ; CHECK-NEXT: vl2re64.v v10, (a2) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -1063,15 +1063,15 @@ define @vector_interleave_nxv8i64_nxv2i64( ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; 
ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; ZVBB-NEXT: vsseg4e64.v v8, (a0) ; ZVBB-NEXT: slli a1, a1, 1 ; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vsetvli a3, zero, e64, m2, ta, ma -; ZVBB-NEXT: vsseg4e64.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vl2re64.v v12, (a3) -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl2re64.v v14, (a1) ; ZVBB-NEXT: vl2re64.v v8, (a0) +; ZVBB-NEXT: add a0, a2, a1 +; ZVBB-NEXT: vl2re64.v v12, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re64.v v14, (a0) ; ZVBB-NEXT: vl2re64.v v10, (a2) ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 3 @@ -1092,7 +1092,7 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: addi a3, sp, 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 @@ -1102,52 +1102,52 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 -; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: srli a3, a1, 1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: srli a2, a1, 1 ; CHECK-NEXT: vmv2r.v v20, v14 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 -; CHECK-NEXT: vmv1r.v v21, v18 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 +; CHECK-NEXT: srli a5, a1, 2 +; CHECK-NEXT: vmv1r.v v21, v18 ; CHECK-NEXT: vmv1r.v v22, v16 ; CHECK-NEXT: vmv1r.v v16, v19 -; CHECK-NEXT: add a5, a2, a1 ; CHECK-NEXT: vmv1r.v v23, v8 -; CHECK-NEXT: vmv1r.v v18, v9 ; CHECK-NEXT: vmv1r.v v0, v11 ; CHECK-NEXT: vmerge.vim v24, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v18, v9 ; CHECK-NEXT: vsetvli a6, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg5e8.v v20, (a4) +; CHECK-NEXT: vsseg5e8.v v20, (a3) ; CHECK-NEXT: vmv1r.v v19, v25 ; CHECK-NEXT: vsseg5e8.v v15, (a0) -; CHECK-NEXT: vl1r.v v8, (a5) -; CHECK-NEXT: add a5, a5, a1 -; CHECK-NEXT: vl1r.v v10, (a4) -; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: vl1r.v v8, (a3) +; CHECK-NEXT: add a3, a0, a1 +; CHECK-NEXT: vl1r.v v9, (a4) +; CHECK-NEXT: add a4, a4, a1 +; CHECK-NEXT: vl1r.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 ; CHECK-NEXT: vl1r.v v12, (a4) -; CHECK-NEXT: add a4, a0, a1 -; CHECK-NEXT: vl1r.v v14, (a4) ; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vl1r.v v9, (a5) -; CHECK-NEXT: add a5, a4, a1 -; CHECK-NEXT: vl1r.v v16, (a5) -; CHECK-NEXT: add a5, a5, a1 -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: vl1r.v v11, (a2) -; CHECK-NEXT: vl1r.v v15, (a4) -; CHECK-NEXT: vl1r.v v13, (a0) -; CHECK-NEXT: vl1r.v v17, (a5) +; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1r.v v13, (a4) +; CHECK-NEXT: add a4, a4, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1r.v v14, (a4) +; CHECK-NEXT: vl1r.v v15, (a0) +; CHECK-NEXT: vl1r.v v16, (a3) +; CHECK-NEXT: vl1r.v v17, (a1) ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmsne.vi v18, v8, 0 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmsne.vi v8, v14, 0 -; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vmsne.vi v18, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v14, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v18, a1 -; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vslideup.vx v0, v18, a5 +; CHECK-NEXT: vslideup.vx v9, v8, a5 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v9, a3 +; CHECK-NEXT: vslideup.vx v0, v9, 
a2 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmsne.vi v8, v16, 0 ; CHECK-NEXT: csrr a0, vlenb @@ -1166,7 +1166,7 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; ZVBB-NEXT: vmv.v.i v12, 0 -; ZVBB-NEXT: addi a4, sp, 16 +; ZVBB-NEXT: addi a3, sp, 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a1, a0, 2 ; ZVBB-NEXT: add a0, a1, a0 @@ -1176,52 +1176,52 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 ; ZVBB-NEXT: vmv1r.v v0, v8 ; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 -; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: srli a3, a1, 1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: srli a2, a1, 1 ; ZVBB-NEXT: vmv2r.v v20, v14 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 -; ZVBB-NEXT: vmv1r.v v21, v18 ; ZVBB-NEXT: vmv1r.v v0, v10 ; ZVBB-NEXT: vmerge.vim v8, v12, 1, v0 +; ZVBB-NEXT: srli a5, a1, 2 +; ZVBB-NEXT: vmv1r.v v21, v18 ; ZVBB-NEXT: vmv1r.v v22, v16 ; ZVBB-NEXT: vmv1r.v v16, v19 -; ZVBB-NEXT: add a5, a2, a1 ; ZVBB-NEXT: vmv1r.v v23, v8 -; ZVBB-NEXT: vmv1r.v v18, v9 ; ZVBB-NEXT: vmv1r.v v0, v11 ; ZVBB-NEXT: vmerge.vim v24, v12, 1, v0 +; ZVBB-NEXT: vmv1r.v v18, v9 ; ZVBB-NEXT: vsetvli a6, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg5e8.v v20, (a4) +; ZVBB-NEXT: vsseg5e8.v v20, (a3) ; ZVBB-NEXT: vmv1r.v v19, v25 ; ZVBB-NEXT: vsseg5e8.v v15, (a0) -; ZVBB-NEXT: vl1r.v v8, (a5) -; ZVBB-NEXT: add a5, a5, a1 -; ZVBB-NEXT: vl1r.v v10, (a4) -; ZVBB-NEXT: add a4, a5, a1 +; ZVBB-NEXT: vl1r.v v8, (a3) +; ZVBB-NEXT: add a3, a0, a1 +; ZVBB-NEXT: vl1r.v v9, (a4) +; ZVBB-NEXT: add a4, a4, a1 +; ZVBB-NEXT: vl1r.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 ; ZVBB-NEXT: vl1r.v v12, (a4) -; ZVBB-NEXT: add a4, a0, a1 -; ZVBB-NEXT: vl1r.v v14, (a4) ; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vl1r.v v9, (a5) -; ZVBB-NEXT: add a5, a4, a1 -; ZVBB-NEXT: vl1r.v v16, (a5) -; ZVBB-NEXT: add a5, a5, a1 -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: vl1r.v v11, (a2) -; ZVBB-NEXT: vl1r.v v15, (a4) -; ZVBB-NEXT: vl1r.v v13, (a0) -; ZVBB-NEXT: vl1r.v v17, (a5) +; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1r.v v13, (a4) +; ZVBB-NEXT: add a4, a4, a1 +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1r.v v14, (a4) +; ZVBB-NEXT: vl1r.v v15, (a0) +; ZVBB-NEXT: vl1r.v v16, (a3) +; ZVBB-NEXT: vl1r.v v17, (a1) ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; ZVBB-NEXT: vmsne.vi v18, v8, 0 -; ZVBB-NEXT: vmsne.vi v0, v10, 0 -; ZVBB-NEXT: vmsne.vi v8, v14, 0 -; ZVBB-NEXT: vmsne.vi v9, v12, 0 +; ZVBB-NEXT: vmsne.vi v18, v12, 0 +; ZVBB-NEXT: vmsne.vi v0, v8, 0 +; ZVBB-NEXT: vmsne.vi v8, v10, 0 +; ZVBB-NEXT: vmsne.vi v9, v14, 0 ; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v18, a1 -; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vslideup.vx v0, v18, a5 +; ZVBB-NEXT: vslideup.vx v9, v8, a5 ; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v9, a3 +; ZVBB-NEXT: vslideup.vx v0, v9, a2 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; ZVBB-NEXT: vmsne.vi v8, v16, 0 ; ZVBB-NEXT: csrr a0, vlenb @@ -1250,53 +1250,53 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV32-NEXT: vmv2r.v v20, v16 -; RV32-NEXT: addi a0, sp, 64 ; RV32-NEXT: vmv2r.v v18, v12 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: vmv2r.v v16, v8 ; RV32-NEXT: 
vmv2r.v v22, v16 ; RV32-NEXT: vmv2r.v v24, v18 ; RV32-NEXT: vmv1r.v v26, v20 -; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: addi a1, sp, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a0, a2, 2 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 64 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 10 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 64 ; RV32-NEXT: vmv1r.v v23, v10 -; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: add a5, a4, a2 ; RV32-NEXT: vmv1r.v v25, v14 -; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vsseg5e8.v v22, (a1) ; RV32-NEXT: vmv1r.v v18, v11 -; RV32-NEXT: vsseg5e8.v v22, (a0) ; RV32-NEXT: vmv1r.v v20, v15 -; RV32-NEXT: vsseg5e8.v v17, (a1) -; RV32-NEXT: vl1r.v v16, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v17, (a6) -; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1r.v v10, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v11, (a6) -; RV32-NEXT: vl1r.v v8, (a0) -; RV32-NEXT: vl1r.v v9, (a3) -; RV32-NEXT: vl1r.v v14, (a4) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 10 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 64 -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v15, (a5) -; RV32-NEXT: vl1r.v v12, (a6) -; RV32-NEXT: vl1r.v v13, (a1) -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vs2r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vl8r.v v16, (a2) -; RV32-NEXT: vl8r.v v8, (a0) +; RV32-NEXT: vsseg5e8.v v17, (a2) +; RV32-NEXT: add a4, a1, a0 +; RV32-NEXT: vl1r.v v8, (a1) +; RV32-NEXT: add a1, a2, a0 +; RV32-NEXT: vl1r.v v9, (a4) +; RV32-NEXT: add a4, a4, a0 +; RV32-NEXT: vl1r.v v14, (a1) +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: vl1r.v v13, (a2) +; RV32-NEXT: vl1r.v v10, (a4) +; RV32-NEXT: add a4, a4, a0 +; RV32-NEXT: vl1r.v v15, (a1) +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: vl1r.v v11, (a4) +; RV32-NEXT: add a4, a4, a0 +; RV32-NEXT: vl1r.v v16, (a1) +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: vl1r.v v17, (a1) +; RV32-NEXT: vl1r.v v12, (a4) +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: vs2r.v v16, (a0) +; RV32-NEXT: vs8r.v v8, (a3) +; RV32-NEXT: vl8r.v v16, (a0) +; RV32-NEXT: vl8r.v v8, (a3) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload @@ -1316,53 +1316,53 @@ define @vector_interleave_nxv80i8_nxv16i8( ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; RV64-NEXT: vmv2r.v v20, v16 -; RV64-NEXT: addi a0, sp, 64 ; RV64-NEXT: vmv2r.v v18, v12 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: csrr a2, vlenb ; RV64-NEXT: vmv2r.v v16, v8 ; RV64-NEXT: vmv2r.v v22, v16 ; RV64-NEXT: vmv2r.v v24, v18 ; RV64-NEXT: vmv1r.v v26, v20 -; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: addi a1, sp, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a0, a2, 2 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 64 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 10 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 64 ; RV64-NEXT: vmv1r.v v23, v10 -; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: add a5, a4, a2 ; RV64-NEXT: vmv1r.v v25, v14 -; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vsseg5e8.v v22, (a1) ; RV64-NEXT: vmv1r.v v18, v11 -; RV64-NEXT: vsseg5e8.v v22, (a0) ; 
RV64-NEXT: vmv1r.v v20, v15 -; RV64-NEXT: vsseg5e8.v v17, (a1) -; RV64-NEXT: vl1r.v v16, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v17, (a6) -; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1r.v v10, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v11, (a6) -; RV64-NEXT: vl1r.v v8, (a0) -; RV64-NEXT: vl1r.v v9, (a3) -; RV64-NEXT: vl1r.v v14, (a4) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 10 -; RV64-NEXT: mul a0, a0, a3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 64 -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v15, (a5) -; RV64-NEXT: vl1r.v v12, (a6) -; RV64-NEXT: vl1r.v v13, (a1) -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vs2r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vl8r.v v16, (a2) -; RV64-NEXT: vl8r.v v8, (a0) +; RV64-NEXT: vsseg5e8.v v17, (a2) +; RV64-NEXT: add a4, a1, a0 +; RV64-NEXT: vl1r.v v8, (a1) +; RV64-NEXT: add a1, a2, a0 +; RV64-NEXT: vl1r.v v9, (a4) +; RV64-NEXT: add a4, a4, a0 +; RV64-NEXT: vl1r.v v14, (a1) +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: vl1r.v v13, (a2) +; RV64-NEXT: vl1r.v v10, (a4) +; RV64-NEXT: add a4, a4, a0 +; RV64-NEXT: vl1r.v v15, (a1) +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: vl1r.v v11, (a4) +; RV64-NEXT: add a4, a4, a0 +; RV64-NEXT: vl1r.v v16, (a1) +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: vl1r.v v17, (a1) +; RV64-NEXT: vl1r.v v12, (a4) +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, a3, a0 +; RV64-NEXT: vs2r.v v16, (a0) +; RV64-NEXT: vs8r.v v8, (a3) +; RV64-NEXT: vl8r.v v16, (a0) +; RV64-NEXT: vl8r.v v8, (a3) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -1382,53 +1382,53 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV32-NEXT: andi sp, sp, -64 ; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; ZVBB-RV32-NEXT: vmv2r.v v20, v16 -; ZVBB-RV32-NEXT: addi a0, sp, 64 ; ZVBB-RV32-NEXT: vmv2r.v v18, v12 -; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 2 -; ZVBB-RV32-NEXT: add a1, a2, a1 -; ZVBB-RV32-NEXT: add a1, sp, a1 -; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: csrr a2, vlenb ; ZVBB-RV32-NEXT: vmv2r.v v16, v8 ; ZVBB-RV32-NEXT: vmv2r.v v22, v16 ; ZVBB-RV32-NEXT: vmv2r.v v24, v18 ; ZVBB-RV32-NEXT: vmv1r.v v26, v20 -; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: addi a1, sp, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: slli a0, a2, 2 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: add a2, sp, a2 +; ZVBB-RV32-NEXT: addi a2, a2, 64 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: csrr a3, vlenb +; ZVBB-RV32-NEXT: li a4, 10 +; ZVBB-RV32-NEXT: mul a3, a3, a4 +; ZVBB-RV32-NEXT: add a3, sp, a3 +; ZVBB-RV32-NEXT: addi a3, a3, 64 ; ZVBB-RV32-NEXT: vmv1r.v v23, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: add a5, a4, a2 ; ZVBB-RV32-NEXT: vmv1r.v v25, v14 -; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vsseg5e8.v v22, (a1) ; ZVBB-RV32-NEXT: vmv1r.v v18, v11 -; ZVBB-RV32-NEXT: vsseg5e8.v v22, (a0) ; ZVBB-RV32-NEXT: vmv1r.v v20, v15 -; ZVBB-RV32-NEXT: vsseg5e8.v v17, (a1) -; ZVBB-RV32-NEXT: vl1r.v v16, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v17, (a6) -; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1r.v v10, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v11, (a6) -; ZVBB-RV32-NEXT: vl1r.v v8, (a0) -; ZVBB-RV32-NEXT: vl1r.v v9, (a3) -; ZVBB-RV32-NEXT: vl1r.v v14, (a4) -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 10 -; ZVBB-RV32-NEXT: mul a0, a0, a3 -; 
ZVBB-RV32-NEXT: add a0, sp, a0 -; ZVBB-RV32-NEXT: addi a0, a0, 64 -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v15, (a5) -; ZVBB-RV32-NEXT: vl1r.v v12, (a6) -; ZVBB-RV32-NEXT: vl1r.v v13, (a1) -; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vs2r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a0) -; ZVBB-RV32-NEXT: vl8r.v v16, (a2) -; ZVBB-RV32-NEXT: vl8r.v v8, (a0) +; ZVBB-RV32-NEXT: vsseg5e8.v v17, (a2) +; ZVBB-RV32-NEXT: add a4, a1, a0 +; ZVBB-RV32-NEXT: vl1r.v v8, (a1) +; ZVBB-RV32-NEXT: add a1, a2, a0 +; ZVBB-RV32-NEXT: vl1r.v v9, (a4) +; ZVBB-RV32-NEXT: add a4, a4, a0 +; ZVBB-RV32-NEXT: vl1r.v v14, (a1) +; ZVBB-RV32-NEXT: add a1, a1, a0 +; ZVBB-RV32-NEXT: vl1r.v v13, (a2) +; ZVBB-RV32-NEXT: vl1r.v v10, (a4) +; ZVBB-RV32-NEXT: add a4, a4, a0 +; ZVBB-RV32-NEXT: vl1r.v v15, (a1) +; ZVBB-RV32-NEXT: add a1, a1, a0 +; ZVBB-RV32-NEXT: vl1r.v v11, (a4) +; ZVBB-RV32-NEXT: add a4, a4, a0 +; ZVBB-RV32-NEXT: vl1r.v v16, (a1) +; ZVBB-RV32-NEXT: add a1, a1, a0 +; ZVBB-RV32-NEXT: vl1r.v v17, (a1) +; ZVBB-RV32-NEXT: vl1r.v v12, (a4) +; ZVBB-RV32-NEXT: slli a0, a0, 3 +; ZVBB-RV32-NEXT: add a0, a3, a0 +; ZVBB-RV32-NEXT: vs2r.v v16, (a0) +; ZVBB-RV32-NEXT: vs8r.v v8, (a3) +; ZVBB-RV32-NEXT: vl8r.v v16, (a0) +; ZVBB-RV32-NEXT: vl8r.v v8, (a3) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload @@ -1448,53 +1448,53 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZVBB-RV64-NEXT: andi sp, sp, -64 ; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; ZVBB-RV64-NEXT: vmv2r.v v20, v16 -; ZVBB-RV64-NEXT: addi a0, sp, 64 ; ZVBB-RV64-NEXT: vmv2r.v v18, v12 -; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 2 -; ZVBB-RV64-NEXT: add a1, a2, a1 -; ZVBB-RV64-NEXT: add a1, sp, a1 -; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: csrr a2, vlenb ; ZVBB-RV64-NEXT: vmv2r.v v16, v8 ; ZVBB-RV64-NEXT: vmv2r.v v22, v16 ; ZVBB-RV64-NEXT: vmv2r.v v24, v18 ; ZVBB-RV64-NEXT: vmv1r.v v26, v20 -; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: addi a1, sp, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: slli a0, a2, 2 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: add a2, sp, a2 +; ZVBB-RV64-NEXT: addi a2, a2, 64 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: csrr a3, vlenb +; ZVBB-RV64-NEXT: li a4, 10 +; ZVBB-RV64-NEXT: mul a3, a3, a4 +; ZVBB-RV64-NEXT: add a3, sp, a3 +; ZVBB-RV64-NEXT: addi a3, a3, 64 ; ZVBB-RV64-NEXT: vmv1r.v v23, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: add a5, a4, a2 ; ZVBB-RV64-NEXT: vmv1r.v v25, v14 -; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vsseg5e8.v v22, (a1) ; ZVBB-RV64-NEXT: vmv1r.v v18, v11 -; ZVBB-RV64-NEXT: vsseg5e8.v v22, (a0) ; ZVBB-RV64-NEXT: vmv1r.v v20, v15 -; ZVBB-RV64-NEXT: vsseg5e8.v v17, (a1) -; ZVBB-RV64-NEXT: vl1r.v v16, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v17, (a6) -; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1r.v v10, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v11, (a6) -; ZVBB-RV64-NEXT: vl1r.v v8, (a0) -; ZVBB-RV64-NEXT: vl1r.v v9, (a3) -; ZVBB-RV64-NEXT: vl1r.v v14, (a4) -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a3, 10 -; ZVBB-RV64-NEXT: mul a0, a0, a3 -; ZVBB-RV64-NEXT: add a0, sp, a0 -; ZVBB-RV64-NEXT: addi a0, a0, 64 -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v15, (a5) -; ZVBB-RV64-NEXT: vl1r.v v12, (a6) -; ZVBB-RV64-NEXT: vl1r.v v13, (a1) -; ZVBB-RV64-NEXT: slli a2, a2, 3 -; 
ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vs2r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a0) -; ZVBB-RV64-NEXT: vl8r.v v16, (a2) -; ZVBB-RV64-NEXT: vl8r.v v8, (a0) +; ZVBB-RV64-NEXT: vsseg5e8.v v17, (a2) +; ZVBB-RV64-NEXT: add a4, a1, a0 +; ZVBB-RV64-NEXT: vl1r.v v8, (a1) +; ZVBB-RV64-NEXT: add a1, a2, a0 +; ZVBB-RV64-NEXT: vl1r.v v9, (a4) +; ZVBB-RV64-NEXT: add a4, a4, a0 +; ZVBB-RV64-NEXT: vl1r.v v14, (a1) +; ZVBB-RV64-NEXT: add a1, a1, a0 +; ZVBB-RV64-NEXT: vl1r.v v13, (a2) +; ZVBB-RV64-NEXT: vl1r.v v10, (a4) +; ZVBB-RV64-NEXT: add a4, a4, a0 +; ZVBB-RV64-NEXT: vl1r.v v15, (a1) +; ZVBB-RV64-NEXT: add a1, a1, a0 +; ZVBB-RV64-NEXT: vl1r.v v11, (a4) +; ZVBB-RV64-NEXT: add a4, a4, a0 +; ZVBB-RV64-NEXT: vl1r.v v16, (a1) +; ZVBB-RV64-NEXT: add a1, a1, a0 +; ZVBB-RV64-NEXT: vl1r.v v17, (a1) +; ZVBB-RV64-NEXT: vl1r.v v12, (a4) +; ZVBB-RV64-NEXT: slli a0, a0, 3 +; ZVBB-RV64-NEXT: add a0, a3, a0 +; ZVBB-RV64-NEXT: vs2r.v v16, (a0) +; ZVBB-RV64-NEXT: vs8r.v v8, (a3) +; ZVBB-RV64-NEXT: vl8r.v v16, (a0) +; ZVBB-RV64-NEXT: vl8r.v v8, (a3) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -1514,53 +1514,53 @@ define @vector_interleave_nxv80i8_nxv16i8( ; ZIP-NEXT: andi sp, sp, -64 ; ZIP-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; ZIP-NEXT: vmv2r.v v20, v16 -; ZIP-NEXT: addi a0, sp, 64 ; ZIP-NEXT: vmv2r.v v18, v12 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 2 -; ZIP-NEXT: add a1, a2, a1 -; ZIP-NEXT: add a1, sp, a1 -; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: csrr a2, vlenb ; ZIP-NEXT: vmv2r.v v16, v8 ; ZIP-NEXT: vmv2r.v v22, v16 ; ZIP-NEXT: vmv2r.v v24, v18 ; ZIP-NEXT: vmv1r.v v26, v20 -; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: addi a1, sp, 64 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: slli a0, a2, 2 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: add a2, sp, a2 +; ZIP-NEXT: addi a2, a2, 64 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: csrr a3, vlenb +; ZIP-NEXT: li a4, 10 +; ZIP-NEXT: mul a3, a3, a4 +; ZIP-NEXT: add a3, sp, a3 +; ZIP-NEXT: addi a3, a3, 64 ; ZIP-NEXT: vmv1r.v v23, v10 -; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: add a5, a4, a2 ; ZIP-NEXT: vmv1r.v v25, v14 -; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vsseg5e8.v v22, (a1) ; ZIP-NEXT: vmv1r.v v18, v11 -; ZIP-NEXT: vsseg5e8.v v22, (a0) ; ZIP-NEXT: vmv1r.v v20, v15 -; ZIP-NEXT: vsseg5e8.v v17, (a1) -; ZIP-NEXT: vl1r.v v16, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v17, (a6) -; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1r.v v10, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v11, (a6) -; ZIP-NEXT: vl1r.v v8, (a0) -; ZIP-NEXT: vl1r.v v9, (a3) -; ZIP-NEXT: vl1r.v v14, (a4) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 10 -; ZIP-NEXT: mul a0, a0, a3 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 64 -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v15, (a5) -; ZIP-NEXT: vl1r.v v12, (a6) -; ZIP-NEXT: vl1r.v v13, (a1) -; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vs2r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a0) -; ZIP-NEXT: vl8r.v v16, (a2) -; ZIP-NEXT: vl8r.v v8, (a0) +; ZIP-NEXT: vsseg5e8.v v17, (a2) +; ZIP-NEXT: add a4, a1, a0 +; ZIP-NEXT: vl1r.v v8, (a1) +; ZIP-NEXT: add a1, a2, a0 +; ZIP-NEXT: vl1r.v v9, (a4) +; ZIP-NEXT: add a4, a4, a0 +; ZIP-NEXT: vl1r.v v14, (a1) +; ZIP-NEXT: add a1, a1, a0 +; ZIP-NEXT: vl1r.v v13, (a2) +; ZIP-NEXT: vl1r.v v10, (a4) +; ZIP-NEXT: add a4, a4, a0 +; ZIP-NEXT: vl1r.v v15, (a1) +; ZIP-NEXT: add a1, a1, a0 +; ZIP-NEXT: vl1r.v v11, (a4) +; ZIP-NEXT: add a4, a4, a0 +; ZIP-NEXT: vl1r.v 
v16, (a1) +; ZIP-NEXT: add a1, a1, a0 +; ZIP-NEXT: vl1r.v v17, (a1) +; ZIP-NEXT: vl1r.v v12, (a4) +; ZIP-NEXT: slli a0, a0, 3 +; ZIP-NEXT: add a0, a3, a0 +; ZIP-NEXT: vs2r.v v16, (a0) +; ZIP-NEXT: vs8r.v v8, (a3) +; ZIP-NEXT: vl8r.v v16, (a0) +; ZIP-NEXT: vl8r.v v8, (a3) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -1581,15 +1581,15 @@ define @vector_interleave_nxv40i8_nxv8i8( %a ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg5e8.v v8, (a0) ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg5e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v9, (a2) ; CHECK-NEXT: vl1r.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 ; CHECK-NEXT: vl1r.v v11, (a3) -; CHECK-NEXT: vl1r.v v8, (a0) -; CHECK-NEXT: vl1r.v v9, (a2) ; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: vl1r.v v12, (a1) ; CHECK-NEXT: csrr a0, vlenb @@ -1608,15 +1608,15 @@ define @vector_interleave_nxv40i8_nxv8i8( %a ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg5e8.v v8, (a0) ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg5e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v9, (a2) ; ZVBB-NEXT: vl1r.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 ; ZVBB-NEXT: vl1r.v v11, (a3) -; ZVBB-NEXT: vl1r.v v8, (a0) -; ZVBB-NEXT: vl1r.v v9, (a2) ; ZVBB-NEXT: add a1, a3, a1 ; ZVBB-NEXT: vl1r.v v12, (a1) ; ZVBB-NEXT: csrr a0, vlenb @@ -1645,53 +1645,53 @@ define @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv20i32_nxv4i32( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv10i64_nxv2i64( @vector_interleave_nxv96i1_nxv16i1( ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v20, 0 -; CHECK-NEXT: vmerge.vim v14, v20, 1, v0 +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: li a0, 6 +; CHECK-NEXT: mul a5, a5, a0 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmerge.vim v16, v14, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v22, v20, 1, v0 +; CHECK-NEXT: vmerge.vim v24, v14, 1, v0 +; CHECK-NEXT: add a4, a5, a2 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: srli a1, a2, 1 +; CHECK-NEXT: vmv1r.v v18, v25 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vmv1r.v v16, v23 -; CHECK-NEXT: vmerge.vim v8, v20, 1, v0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vmv1r.v v17, v9 +; CHECK-NEXT: vmerge.vim v8, v14, 1, v0 +; CHECK-NEXT: add a6, a3, a2 +; CHECK-NEXT: vmv1r.v v19, v9 ; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vmerge.vim v24, v20, 1, v0 -; CHECK-NEXT: addi a4, sp, 16 -; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmerge.vim v26, v14, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v11 -; CHECK-NEXT: vmerge.vim v26, v20, 1, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: 
vmv1r.v v19, v27 +; CHECK-NEXT: vmerge.vim v10, v14, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vmerge.vim v10, v20, 1, v0 -; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vmv1r.v v20, v11 -; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg6e8.v v15, (a0) -; CHECK-NEXT: vmv1r.v v15, v22 -; CHECK-NEXT: add a5, a4, a1 -; CHECK-NEXT: vmv1r.v v16, v8 -; CHECK-NEXT: srli a3, a1, 1 +; CHECK-NEXT: vmerge.vim v12, v14, 1, v0 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv1r.v v21, v11 +; CHECK-NEXT: vmv1r.v v22, v13 +; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg6e8.v v17, (a5) ; CHECK-NEXT: vmv1r.v v17, v24 -; CHECK-NEXT: add a6, a5, a1 -; CHECK-NEXT: vmv1r.v v18, v26 -; CHECK-NEXT: add a7, a2, a1 -; CHECK-NEXT: vmv1r.v v19, v10 -; CHECK-NEXT: vsseg6e8.v v14, (a4) -; CHECK-NEXT: vl1r.v v8, (a0) -; CHECK-NEXT: add a0, a6, a1 -; CHECK-NEXT: vl1r.v v10, (a6) -; CHECK-NEXT: add a6, a7, a1 -; CHECK-NEXT: vl1r.v v12, (a4) -; CHECK-NEXT: add a4, a0, a1 -; CHECK-NEXT: vl1r.v v14, (a7) -; CHECK-NEXT: add a7, a6, a1 -; CHECK-NEXT: vl1r.v v16, (a4) -; CHECK-NEXT: add a4, a4, a1 -; CHECK-NEXT: vl1r.v v18, (a7) -; CHECK-NEXT: add a7, a7, a1 -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: vl1r.v v9, (a2) -; CHECK-NEXT: vl1r.v v17, (a4) -; CHECK-NEXT: vl1r.v v11, (a0) -; CHECK-NEXT: vl1r.v v13, (a5) -; CHECK-NEXT: vl1r.v v19, (a7) -; CHECK-NEXT: vl1r.v v15, (a6) +; CHECK-NEXT: vmv1r.v v18, v8 +; CHECK-NEXT: vmv1r.v v19, v26 +; CHECK-NEXT: vmv1r.v v20, v10 +; CHECK-NEXT: vmv1r.v v21, v12 +; CHECK-NEXT: vsseg6e8.v v16, (a0) +; CHECK-NEXT: vl1r.v v8, (a5) +; CHECK-NEXT: add a5, a7, a2 +; CHECK-NEXT: vl1r.v v10, (a5) +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vl1r.v v11, (a5) +; CHECK-NEXT: srli a5, a2, 2 +; CHECK-NEXT: vl1r.v v9, (a4) +; CHECK-NEXT: add a4, a4, a2 +; CHECK-NEXT: vl1r.v v12, (a6) +; CHECK-NEXT: add a6, a4, a2 +; CHECK-NEXT: vl1r.v v13, (a7) +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: add a2, a7, a2 +; CHECK-NEXT: vl1r.v v14, (a0) +; CHECK-NEXT: vl1r.v v15, (a3) +; CHECK-NEXT: vl1r.v v16, (a7) +; CHECK-NEXT: vl1r.v v17, (a2) +; CHECK-NEXT: vl1r.v v18, (a4) +; CHECK-NEXT: vl1r.v v19, (a6) ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmsne.vi v20, v8, 0 -; CHECK-NEXT: vmsne.vi v9, v16, 0 -; CHECK-NEXT: vmsne.vi v16, v10, 0 -; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vmsne.vi v10, v18, 0 -; CHECK-NEXT: vmsne.vi v8, v14, 0 +; CHECK-NEXT: vmsne.vi v9, v10, 0 +; CHECK-NEXT: vmsne.vi v10, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v14, 0 +; CHECK-NEXT: vmsne.vi v11, v16, 0 +; CHECK-NEXT: vmsne.vi v8, v18, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v9, v20, a1 -; CHECK-NEXT: vslideup.vx v0, v16, a1 +; CHECK-NEXT: vslideup.vx v9, v20, a5 +; CHECK-NEXT: vslideup.vx v0, v10, a5 ; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v9, a3 +; CHECK-NEXT: vslideup.vx v0, v9, a1 ; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v11, a5 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 12 ; CHECK-NEXT: mul a0, a0, a1 @@ -2395,76 +2395,76 @@ define @vector_interleave_nxv96i1_nxv16i1( ; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; ZVBB-NEXT: vmv.v.i v20, 0 -; ZVBB-NEXT: vmerge.vim v14, v20, 1, v0 +; ZVBB-NEXT: vmv.v.i v14, 0 +; ZVBB-NEXT: csrr a5, vlenb +; ZVBB-NEXT: li a0, 6 +; ZVBB-NEXT: mul a5, a5, a0 +; ZVBB-NEXT: add a5, sp, a5 +; ZVBB-NEXT: 
addi a5, a5, 16 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a2, vlenb +; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0 ; ZVBB-NEXT: vmv1r.v v0, v8 -; ZVBB-NEXT: vmerge.vim v22, v20, 1, v0 +; ZVBB-NEXT: vmerge.vim v24, v14, 1, v0 +; ZVBB-NEXT: add a4, a5, a2 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: srli a1, a2, 1 +; ZVBB-NEXT: vmv1r.v v18, v25 ; ZVBB-NEXT: vmv1r.v v0, v9 -; ZVBB-NEXT: vmv1r.v v16, v23 -; ZVBB-NEXT: vmerge.vim v8, v20, 1, v0 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: add a0, sp, a0 -; ZVBB-NEXT: addi a0, a0, 16 -; ZVBB-NEXT: vmv1r.v v17, v9 +; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0 +; ZVBB-NEXT: add a6, a3, a2 +; ZVBB-NEXT: vmv1r.v v19, v9 ; ZVBB-NEXT: vmv1r.v v0, v10 -; ZVBB-NEXT: vmerge.vim v24, v20, 1, v0 -; ZVBB-NEXT: addi a4, sp, 16 -; ZVBB-NEXT: vmv1r.v v18, v25 +; ZVBB-NEXT: vmerge.vim v26, v14, 1, v0 ; ZVBB-NEXT: vmv1r.v v0, v11 -; ZVBB-NEXT: vmerge.vim v26, v20, 1, v0 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vmv1r.v v19, v27 +; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0 ; ZVBB-NEXT: vmv1r.v v0, v12 -; ZVBB-NEXT: vmerge.vim v10, v20, 1, v0 -; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vmv1r.v v20, v11 -; ZVBB-NEXT: vsetvli a3, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg6e8.v v15, (a0) -; ZVBB-NEXT: vmv1r.v v15, v22 -; ZVBB-NEXT: add a5, a4, a1 -; ZVBB-NEXT: vmv1r.v v16, v8 -; ZVBB-NEXT: srli a3, a1, 1 +; ZVBB-NEXT: vmerge.vim v12, v14, 1, v0 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vmv1r.v v20, v27 +; ZVBB-NEXT: vmv1r.v v21, v11 +; ZVBB-NEXT: vmv1r.v v22, v13 +; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg6e8.v v17, (a5) ; ZVBB-NEXT: vmv1r.v v17, v24 -; ZVBB-NEXT: add a6, a5, a1 -; ZVBB-NEXT: vmv1r.v v18, v26 -; ZVBB-NEXT: add a7, a2, a1 -; ZVBB-NEXT: vmv1r.v v19, v10 -; ZVBB-NEXT: vsseg6e8.v v14, (a4) -; ZVBB-NEXT: vl1r.v v8, (a0) -; ZVBB-NEXT: add a0, a6, a1 -; ZVBB-NEXT: vl1r.v v10, (a6) -; ZVBB-NEXT: add a6, a7, a1 -; ZVBB-NEXT: vl1r.v v12, (a4) -; ZVBB-NEXT: add a4, a0, a1 -; ZVBB-NEXT: vl1r.v v14, (a7) -; ZVBB-NEXT: add a7, a6, a1 -; ZVBB-NEXT: vl1r.v v16, (a4) -; ZVBB-NEXT: add a4, a4, a1 -; ZVBB-NEXT: vl1r.v v18, (a7) -; ZVBB-NEXT: add a7, a7, a1 -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: vl1r.v v9, (a2) -; ZVBB-NEXT: vl1r.v v17, (a4) -; ZVBB-NEXT: vl1r.v v11, (a0) -; ZVBB-NEXT: vl1r.v v13, (a5) -; ZVBB-NEXT: vl1r.v v19, (a7) -; ZVBB-NEXT: vl1r.v v15, (a6) +; ZVBB-NEXT: vmv1r.v v18, v8 +; ZVBB-NEXT: vmv1r.v v19, v26 +; ZVBB-NEXT: vmv1r.v v20, v10 +; ZVBB-NEXT: vmv1r.v v21, v12 +; ZVBB-NEXT: vsseg6e8.v v16, (a0) +; ZVBB-NEXT: vl1r.v v8, (a5) +; ZVBB-NEXT: add a5, a7, a2 +; ZVBB-NEXT: vl1r.v v10, (a5) +; ZVBB-NEXT: add a5, a5, a2 +; ZVBB-NEXT: vl1r.v v11, (a5) +; ZVBB-NEXT: srli a5, a2, 2 +; ZVBB-NEXT: vl1r.v v9, (a4) +; ZVBB-NEXT: add a4, a4, a2 +; ZVBB-NEXT: vl1r.v v12, (a6) +; ZVBB-NEXT: add a6, a4, a2 +; ZVBB-NEXT: vl1r.v v13, (a7) +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: add a2, a7, a2 +; ZVBB-NEXT: vl1r.v v14, (a0) +; ZVBB-NEXT: vl1r.v v15, (a3) +; ZVBB-NEXT: vl1r.v v16, (a7) +; ZVBB-NEXT: vl1r.v v17, (a2) +; ZVBB-NEXT: vl1r.v v18, (a4) +; ZVBB-NEXT: vl1r.v v19, (a6) ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; ZVBB-NEXT: vmsne.vi v20, v8, 0 -; ZVBB-NEXT: vmsne.vi v9, v16, 0 -; ZVBB-NEXT: vmsne.vi v16, v10, 0 -; ZVBB-NEXT: vmsne.vi v0, v12, 0 -; ZVBB-NEXT: vmsne.vi v10, v18, 0 -; ZVBB-NEXT: vmsne.vi v8, v14, 0 +; ZVBB-NEXT: vmsne.vi v9, v10, 0 +; ZVBB-NEXT: vmsne.vi v10, v12, 0 +; ZVBB-NEXT: vmsne.vi v0, v14, 0 +; ZVBB-NEXT: vmsne.vi v11, v16, 0 +; ZVBB-NEXT: vmsne.vi v8, v18, 0 ; 
ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v9, v20, a1 -; ZVBB-NEXT: vslideup.vx v0, v16, a1 +; ZVBB-NEXT: vslideup.vx v9, v20, a5 +; ZVBB-NEXT: vslideup.vx v0, v10, a5 ; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v9, a3 +; ZVBB-NEXT: vslideup.vx v0, v9, a1 ; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: vslideup.vx v8, v11, a5 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: li a1, 12 ; ZVBB-NEXT: mul a0, a0, a1 @@ -2489,61 +2489,59 @@ define @vector_interleave_nxv96i8_nxv16i8( ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-NEXT: vmv2r.v v20, v14 -; RV32-NEXT: vmv2r.v v22, v12 -; RV32-NEXT: vmv2r.v v24, v10 +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: vmv1r.v v23, v13 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vmv1r.v v25, v17 +; RV32-NEXT: vmv1r.v v26, v19 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a0, 6 ; RV32-NEXT: mul a1, a1, a0 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: vmv1r.v v10, v25 -; RV32-NEXT: vmv1r.v v11, v23 -; RV32-NEXT: vmv1r.v v12, v21 -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv1r.v v13, v17 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv1r.v v14, v19 -; RV32-NEXT: vsseg6e8.v v9, (a1) -; RV32-NEXT: vmv1r.v v9, v24 -; RV32-NEXT: add a5, a1, a2 -; RV32-NEXT: vmv1r.v v10, v22 -; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v11, v20 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vmv1r.v v12, v16 -; RV32-NEXT: add a6, a5, a2 -; RV32-NEXT: vmv1r.v v13, v18 -; RV32-NEXT: vsseg6e8.v v8, (a0) +; RV32-NEXT: addi a2, sp, 64 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 12 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 64 +; RV32-NEXT: vsseg6e8.v v21, (a1) +; RV32-NEXT: vmv1r.v v21, v10 +; RV32-NEXT: vmv1r.v v22, v12 +; RV32-NEXT: vmv1r.v v23, v14 +; RV32-NEXT: vmv1r.v v24, v16 +; RV32-NEXT: vmv1r.v v25, v18 +; RV32-NEXT: vsseg6e8.v v20, (a2) +; RV32-NEXT: add a4, a1, a0 +; RV32-NEXT: add a5, a2, a0 ; RV32-NEXT: vl1r.v v14, (a1) -; RV32-NEXT: add a1, a6, a2 -; RV32-NEXT: vl1r.v v15, (a5) -; RV32-NEXT: add a5, a1, a2 -; RV32-NEXT: vl1r.v v18, (a5) -; RV32-NEXT: add a5, a5, a2 -; RV32-NEXT: vl1r.v v19, (a5) -; RV32-NEXT: add a5, a4, a2 -; RV32-NEXT: vl1r.v v16, (a6) -; RV32-NEXT: add a6, a5, a2 -; RV32-NEXT: vl1r.v v12, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v13, (a6) -; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 12 -; RV32-NEXT: mul a6, a6, a7 -; RV32-NEXT: add a6, sp, a6 -; RV32-NEXT: addi a6, a6, 64 +; RV32-NEXT: vl1r.v v15, (a4) +; RV32-NEXT: add a1, a5, a0 +; RV32-NEXT: vl1r.v v8, (a2) +; RV32-NEXT: vl1r.v v9, (a5) +; RV32-NEXT: add a4, a4, a0 +; RV32-NEXT: add a2, a1, a0 +; RV32-NEXT: vl1r.v v10, (a1) +; RV32-NEXT: add a1, a4, a0 +; RV32-NEXT: vl1r.v v16, (a4) +; RV32-NEXT: add a4, a2, a0 +; RV32-NEXT: vl1r.v v11, (a2) +; RV32-NEXT: add a2, a1, a0 ; RV32-NEXT: vl1r.v v17, (a1) -; RV32-NEXT: vl1r.v v10, (a4) -; RV32-NEXT: vl1r.v v11, (a5) -; RV32-NEXT: vl1r.v v8, (a0) -; RV32-NEXT: vl1r.v v9, (a3) -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a6, a2 -; RV32-NEXT: vs4r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a6) -; RV32-NEXT: vl8r.v v16, (a2) -; RV32-NEXT: vl8r.v v8, (a6) +; RV32-NEXT: vl1r.v v12, (a4) +; RV32-NEXT: add a4, a4, a0 +; RV32-NEXT: vl1r.v v18, (a2) +; RV32-NEXT: add a2, a2, a0 +; RV32-NEXT: vl1r.v v19, (a2) +; RV32-NEXT: vl1r.v v13, (a4) +; RV32-NEXT: 
slli a0, a0, 3 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: vs4r.v v16, (a0) +; RV32-NEXT: vs8r.v v8, (a3) +; RV32-NEXT: vl8r.v v16, (a0) +; RV32-NEXT: vl8r.v v8, (a3) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload @@ -2562,61 +2560,59 @@ define @vector_interleave_nxv96i8_nxv16i8( ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-NEXT: vmv2r.v v20, v14 -; RV64-NEXT: vmv2r.v v22, v12 -; RV64-NEXT: vmv2r.v v24, v10 +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: vmv1r.v v23, v13 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vmv1r.v v25, v17 +; RV64-NEXT: vmv1r.v v26, v19 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a0, 6 ; RV64-NEXT: mul a1, a1, a0 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: vmv1r.v v10, v25 -; RV64-NEXT: vmv1r.v v11, v23 -; RV64-NEXT: vmv1r.v v12, v21 -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv1r.v v13, v17 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vmv1r.v v14, v19 -; RV64-NEXT: vsseg6e8.v v9, (a1) -; RV64-NEXT: vmv1r.v v9, v24 -; RV64-NEXT: add a5, a1, a2 -; RV64-NEXT: vmv1r.v v10, v22 -; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v11, v20 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vmv1r.v v12, v16 -; RV64-NEXT: add a6, a5, a2 -; RV64-NEXT: vmv1r.v v13, v18 -; RV64-NEXT: vsseg6e8.v v8, (a0) +; RV64-NEXT: addi a2, sp, 64 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 12 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 64 +; RV64-NEXT: vsseg6e8.v v21, (a1) +; RV64-NEXT: vmv1r.v v21, v10 +; RV64-NEXT: vmv1r.v v22, v12 +; RV64-NEXT: vmv1r.v v23, v14 +; RV64-NEXT: vmv1r.v v24, v16 +; RV64-NEXT: vmv1r.v v25, v18 +; RV64-NEXT: vsseg6e8.v v20, (a2) +; RV64-NEXT: add a4, a1, a0 +; RV64-NEXT: add a5, a2, a0 ; RV64-NEXT: vl1r.v v14, (a1) -; RV64-NEXT: add a1, a6, a2 -; RV64-NEXT: vl1r.v v15, (a5) -; RV64-NEXT: add a5, a1, a2 -; RV64-NEXT: vl1r.v v18, (a5) -; RV64-NEXT: add a5, a5, a2 -; RV64-NEXT: vl1r.v v19, (a5) -; RV64-NEXT: add a5, a4, a2 -; RV64-NEXT: vl1r.v v16, (a6) -; RV64-NEXT: add a6, a5, a2 -; RV64-NEXT: vl1r.v v12, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v13, (a6) -; RV64-NEXT: csrr a6, vlenb -; RV64-NEXT: li a7, 12 -; RV64-NEXT: mul a6, a6, a7 -; RV64-NEXT: add a6, sp, a6 -; RV64-NEXT: addi a6, a6, 64 +; RV64-NEXT: vl1r.v v15, (a4) +; RV64-NEXT: add a1, a5, a0 +; RV64-NEXT: vl1r.v v8, (a2) +; RV64-NEXT: vl1r.v v9, (a5) +; RV64-NEXT: add a4, a4, a0 +; RV64-NEXT: add a2, a1, a0 +; RV64-NEXT: vl1r.v v10, (a1) +; RV64-NEXT: add a1, a4, a0 +; RV64-NEXT: vl1r.v v16, (a4) +; RV64-NEXT: add a4, a2, a0 +; RV64-NEXT: vl1r.v v11, (a2) +; RV64-NEXT: add a2, a1, a0 ; RV64-NEXT: vl1r.v v17, (a1) -; RV64-NEXT: vl1r.v v10, (a4) -; RV64-NEXT: vl1r.v v11, (a5) -; RV64-NEXT: vl1r.v v8, (a0) -; RV64-NEXT: vl1r.v v9, (a3) -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a6, a2 -; RV64-NEXT: vs4r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a6) -; RV64-NEXT: vl8r.v v16, (a2) -; RV64-NEXT: vl8r.v v8, (a6) +; RV64-NEXT: vl1r.v v12, (a4) +; RV64-NEXT: add a4, a4, a0 +; RV64-NEXT: vl1r.v v18, (a2) +; RV64-NEXT: add a2, a2, a0 +; RV64-NEXT: vl1r.v v19, (a2) +; RV64-NEXT: vl1r.v v13, (a4) +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, a3, a0 +; RV64-NEXT: vs4r.v v16, (a0) +; RV64-NEXT: vs8r.v v8, (a3) +; RV64-NEXT: vl8r.v v16, (a0) +; RV64-NEXT: vl8r.v v8, (a3) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) 
# 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -2635,61 +2631,59 @@ define @vector_interleave_nxv96i8_nxv16i8( ; ZVBB-RV32-NEXT: sub sp, sp, a0 ; ZVBB-RV32-NEXT: andi sp, sp, -64 ; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v20, v14 -; ZVBB-RV32-NEXT: vmv2r.v v22, v12 -; ZVBB-RV32-NEXT: vmv2r.v v24, v10 +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: vmv1r.v v23, v13 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vmv1r.v v25, v17 +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 ; ZVBB-RV32-NEXT: csrr a1, vlenb ; ZVBB-RV32-NEXT: li a0, 6 ; ZVBB-RV32-NEXT: mul a1, a1, a0 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: vmv1r.v v10, v25 -; ZVBB-RV32-NEXT: vmv1r.v v11, v23 -; ZVBB-RV32-NEXT: vmv1r.v v12, v21 -; ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv1r.v v13, v17 -; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv1r.v v14, v19 -; ZVBB-RV32-NEXT: vsseg6e8.v v9, (a1) -; ZVBB-RV32-NEXT: vmv1r.v v9, v24 -; ZVBB-RV32-NEXT: add a5, a1, a2 -; ZVBB-RV32-NEXT: vmv1r.v v10, v22 -; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v11, v20 -; ZVBB-RV32-NEXT: add a4, a3, a2 -; ZVBB-RV32-NEXT: vmv1r.v v12, v16 -; ZVBB-RV32-NEXT: add a6, a5, a2 -; ZVBB-RV32-NEXT: vmv1r.v v13, v18 -; ZVBB-RV32-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-RV32-NEXT: addi a2, sp, 64 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: csrr a3, vlenb +; ZVBB-RV32-NEXT: li a4, 12 +; ZVBB-RV32-NEXT: mul a3, a3, a4 +; ZVBB-RV32-NEXT: add a3, sp, a3 +; ZVBB-RV32-NEXT: addi a3, a3, 64 +; ZVBB-RV32-NEXT: vsseg6e8.v v21, (a1) +; ZVBB-RV32-NEXT: vmv1r.v v21, v10 +; ZVBB-RV32-NEXT: vmv1r.v v22, v12 +; ZVBB-RV32-NEXT: vmv1r.v v23, v14 +; ZVBB-RV32-NEXT: vmv1r.v v24, v16 +; ZVBB-RV32-NEXT: vmv1r.v v25, v18 +; ZVBB-RV32-NEXT: vsseg6e8.v v20, (a2) +; ZVBB-RV32-NEXT: add a4, a1, a0 +; ZVBB-RV32-NEXT: add a5, a2, a0 ; ZVBB-RV32-NEXT: vl1r.v v14, (a1) -; ZVBB-RV32-NEXT: add a1, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v15, (a5) -; ZVBB-RV32-NEXT: add a5, a1, a2 -; ZVBB-RV32-NEXT: vl1r.v v18, (a5) -; ZVBB-RV32-NEXT: add a5, a5, a2 -; ZVBB-RV32-NEXT: vl1r.v v19, (a5) -; ZVBB-RV32-NEXT: add a5, a4, a2 -; ZVBB-RV32-NEXT: vl1r.v v16, (a6) -; ZVBB-RV32-NEXT: add a6, a5, a2 -; ZVBB-RV32-NEXT: vl1r.v v12, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v13, (a6) -; ZVBB-RV32-NEXT: csrr a6, vlenb -; ZVBB-RV32-NEXT: li a7, 12 -; ZVBB-RV32-NEXT: mul a6, a6, a7 -; ZVBB-RV32-NEXT: add a6, sp, a6 -; ZVBB-RV32-NEXT: addi a6, a6, 64 +; ZVBB-RV32-NEXT: vl1r.v v15, (a4) +; ZVBB-RV32-NEXT: add a1, a5, a0 +; ZVBB-RV32-NEXT: vl1r.v v8, (a2) +; ZVBB-RV32-NEXT: vl1r.v v9, (a5) +; ZVBB-RV32-NEXT: add a4, a4, a0 +; ZVBB-RV32-NEXT: add a2, a1, a0 +; ZVBB-RV32-NEXT: vl1r.v v10, (a1) +; ZVBB-RV32-NEXT: add a1, a4, a0 +; ZVBB-RV32-NEXT: vl1r.v v16, (a4) +; ZVBB-RV32-NEXT: add a4, a2, a0 +; ZVBB-RV32-NEXT: vl1r.v v11, (a2) +; ZVBB-RV32-NEXT: add a2, a1, a0 ; ZVBB-RV32-NEXT: vl1r.v v17, (a1) -; ZVBB-RV32-NEXT: vl1r.v v10, (a4) -; ZVBB-RV32-NEXT: vl1r.v v11, (a5) -; ZVBB-RV32-NEXT: vl1r.v v8, (a0) -; ZVBB-RV32-NEXT: vl1r.v v9, (a3) -; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a6, a2 -; ZVBB-RV32-NEXT: vs4r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a6) -; ZVBB-RV32-NEXT: vl8r.v v16, (a2) -; ZVBB-RV32-NEXT: vl8r.v v8, (a6) +; ZVBB-RV32-NEXT: vl1r.v v12, (a4) +; ZVBB-RV32-NEXT: add a4, a4, a0 +; ZVBB-RV32-NEXT: vl1r.v v18, (a2) +; ZVBB-RV32-NEXT: add a2, a2, a0 +; ZVBB-RV32-NEXT: vl1r.v v19, (a2) +; 
ZVBB-RV32-NEXT: vl1r.v v13, (a4) +; ZVBB-RV32-NEXT: slli a0, a0, 3 +; ZVBB-RV32-NEXT: add a0, a3, a0 +; ZVBB-RV32-NEXT: vs4r.v v16, (a0) +; ZVBB-RV32-NEXT: vs8r.v v8, (a3) +; ZVBB-RV32-NEXT: vl8r.v v16, (a0) +; ZVBB-RV32-NEXT: vl8r.v v8, (a3) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload @@ -2708,61 +2702,59 @@ define @vector_interleave_nxv96i8_nxv16i8( ; ZVBB-RV64-NEXT: sub sp, sp, a0 ; ZVBB-RV64-NEXT: andi sp, sp, -64 ; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v20, v14 -; ZVBB-RV64-NEXT: vmv2r.v v22, v12 -; ZVBB-RV64-NEXT: vmv2r.v v24, v10 +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: vmv1r.v v23, v13 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vmv1r.v v25, v17 +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 ; ZVBB-RV64-NEXT: csrr a1, vlenb ; ZVBB-RV64-NEXT: li a0, 6 ; ZVBB-RV64-NEXT: mul a1, a1, a0 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: vmv1r.v v10, v25 -; ZVBB-RV64-NEXT: vmv1r.v v11, v23 -; ZVBB-RV64-NEXT: vmv1r.v v12, v21 -; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv1r.v v13, v17 -; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv1r.v v14, v19 -; ZVBB-RV64-NEXT: vsseg6e8.v v9, (a1) -; ZVBB-RV64-NEXT: vmv1r.v v9, v24 -; ZVBB-RV64-NEXT: add a5, a1, a2 -; ZVBB-RV64-NEXT: vmv1r.v v10, v22 -; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v11, v20 -; ZVBB-RV64-NEXT: add a4, a3, a2 -; ZVBB-RV64-NEXT: vmv1r.v v12, v16 -; ZVBB-RV64-NEXT: add a6, a5, a2 -; ZVBB-RV64-NEXT: vmv1r.v v13, v18 -; ZVBB-RV64-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-RV64-NEXT: addi a2, sp, 64 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: csrr a3, vlenb +; ZVBB-RV64-NEXT: li a4, 12 +; ZVBB-RV64-NEXT: mul a3, a3, a4 +; ZVBB-RV64-NEXT: add a3, sp, a3 +; ZVBB-RV64-NEXT: addi a3, a3, 64 +; ZVBB-RV64-NEXT: vsseg6e8.v v21, (a1) +; ZVBB-RV64-NEXT: vmv1r.v v21, v10 +; ZVBB-RV64-NEXT: vmv1r.v v22, v12 +; ZVBB-RV64-NEXT: vmv1r.v v23, v14 +; ZVBB-RV64-NEXT: vmv1r.v v24, v16 +; ZVBB-RV64-NEXT: vmv1r.v v25, v18 +; ZVBB-RV64-NEXT: vsseg6e8.v v20, (a2) +; ZVBB-RV64-NEXT: add a4, a1, a0 +; ZVBB-RV64-NEXT: add a5, a2, a0 ; ZVBB-RV64-NEXT: vl1r.v v14, (a1) -; ZVBB-RV64-NEXT: add a1, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v15, (a5) -; ZVBB-RV64-NEXT: add a5, a1, a2 -; ZVBB-RV64-NEXT: vl1r.v v18, (a5) -; ZVBB-RV64-NEXT: add a5, a5, a2 -; ZVBB-RV64-NEXT: vl1r.v v19, (a5) -; ZVBB-RV64-NEXT: add a5, a4, a2 -; ZVBB-RV64-NEXT: vl1r.v v16, (a6) -; ZVBB-RV64-NEXT: add a6, a5, a2 -; ZVBB-RV64-NEXT: vl1r.v v12, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v13, (a6) -; ZVBB-RV64-NEXT: csrr a6, vlenb -; ZVBB-RV64-NEXT: li a7, 12 -; ZVBB-RV64-NEXT: mul a6, a6, a7 -; ZVBB-RV64-NEXT: add a6, sp, a6 -; ZVBB-RV64-NEXT: addi a6, a6, 64 +; ZVBB-RV64-NEXT: vl1r.v v15, (a4) +; ZVBB-RV64-NEXT: add a1, a5, a0 +; ZVBB-RV64-NEXT: vl1r.v v8, (a2) +; ZVBB-RV64-NEXT: vl1r.v v9, (a5) +; ZVBB-RV64-NEXT: add a4, a4, a0 +; ZVBB-RV64-NEXT: add a2, a1, a0 +; ZVBB-RV64-NEXT: vl1r.v v10, (a1) +; ZVBB-RV64-NEXT: add a1, a4, a0 +; ZVBB-RV64-NEXT: vl1r.v v16, (a4) +; ZVBB-RV64-NEXT: add a4, a2, a0 +; ZVBB-RV64-NEXT: vl1r.v v11, (a2) +; ZVBB-RV64-NEXT: add a2, a1, a0 ; ZVBB-RV64-NEXT: vl1r.v v17, (a1) -; ZVBB-RV64-NEXT: vl1r.v v10, (a4) -; ZVBB-RV64-NEXT: vl1r.v v11, (a5) -; ZVBB-RV64-NEXT: vl1r.v v8, (a0) -; ZVBB-RV64-NEXT: vl1r.v v9, (a3) -; ZVBB-RV64-NEXT: slli a2, a2, 3 -; ZVBB-RV64-NEXT: add a2, a6, a2 -; 
ZVBB-RV64-NEXT: vs4r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a6) -; ZVBB-RV64-NEXT: vl8r.v v16, (a2) -; ZVBB-RV64-NEXT: vl8r.v v8, (a6) +; ZVBB-RV64-NEXT: vl1r.v v12, (a4) +; ZVBB-RV64-NEXT: add a4, a4, a0 +; ZVBB-RV64-NEXT: vl1r.v v18, (a2) +; ZVBB-RV64-NEXT: add a2, a2, a0 +; ZVBB-RV64-NEXT: vl1r.v v19, (a2) +; ZVBB-RV64-NEXT: vl1r.v v13, (a4) +; ZVBB-RV64-NEXT: slli a0, a0, 3 +; ZVBB-RV64-NEXT: add a0, a3, a0 +; ZVBB-RV64-NEXT: vs4r.v v16, (a0) +; ZVBB-RV64-NEXT: vs8r.v v8, (a3) +; ZVBB-RV64-NEXT: vl8r.v v16, (a0) +; ZVBB-RV64-NEXT: vl8r.v v8, (a3) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -2781,61 +2773,59 @@ define @vector_interleave_nxv96i8_nxv16i8( ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: andi sp, sp, -64 ; ZIP-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZIP-NEXT: vmv2r.v v20, v14 -; ZIP-NEXT: vmv2r.v v22, v12 -; ZIP-NEXT: vmv2r.v v24, v10 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: li a0, 6 -; ZIP-NEXT: mul a1, a1, a0 +; ZIP-NEXT: vmv2r.v v20, v8 +; ZIP-NEXT: vmv1r.v v22, v11 +; ZIP-NEXT: vmv1r.v v23, v13 +; ZIP-NEXT: vmv1r.v v24, v15 +; ZIP-NEXT: vmv1r.v v25, v17 +; ZIP-NEXT: vmv1r.v v26, v19 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: li a0, 6 +; ZIP-NEXT: mul a1, a1, a0 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: vmv1r.v v10, v25 -; ZIP-NEXT: vmv1r.v v11, v23 -; ZIP-NEXT: vmv1r.v v12, v21 -; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv1r.v v13, v17 -; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv1r.v v14, v19 -; ZIP-NEXT: vsseg6e8.v v9, (a1) -; ZIP-NEXT: vmv1r.v v9, v24 -; ZIP-NEXT: add a5, a1, a2 -; ZIP-NEXT: vmv1r.v v10, v22 -; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v11, v20 -; ZIP-NEXT: add a4, a3, a2 -; ZIP-NEXT: vmv1r.v v12, v16 -; ZIP-NEXT: add a6, a5, a2 -; ZIP-NEXT: vmv1r.v v13, v18 -; ZIP-NEXT: vsseg6e8.v v8, (a0) +; ZIP-NEXT: addi a2, sp, 64 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: csrr a3, vlenb +; ZIP-NEXT: li a4, 12 +; ZIP-NEXT: mul a3, a3, a4 +; ZIP-NEXT: add a3, sp, a3 +; ZIP-NEXT: addi a3, a3, 64 +; ZIP-NEXT: vsseg6e8.v v21, (a1) +; ZIP-NEXT: vmv1r.v v21, v10 +; ZIP-NEXT: vmv1r.v v22, v12 +; ZIP-NEXT: vmv1r.v v23, v14 +; ZIP-NEXT: vmv1r.v v24, v16 +; ZIP-NEXT: vmv1r.v v25, v18 +; ZIP-NEXT: vsseg6e8.v v20, (a2) +; ZIP-NEXT: add a4, a1, a0 +; ZIP-NEXT: add a5, a2, a0 ; ZIP-NEXT: vl1r.v v14, (a1) -; ZIP-NEXT: add a1, a6, a2 -; ZIP-NEXT: vl1r.v v15, (a5) -; ZIP-NEXT: add a5, a1, a2 -; ZIP-NEXT: vl1r.v v18, (a5) -; ZIP-NEXT: add a5, a5, a2 -; ZIP-NEXT: vl1r.v v19, (a5) -; ZIP-NEXT: add a5, a4, a2 -; ZIP-NEXT: vl1r.v v16, (a6) -; ZIP-NEXT: add a6, a5, a2 -; ZIP-NEXT: vl1r.v v12, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v13, (a6) -; ZIP-NEXT: csrr a6, vlenb -; ZIP-NEXT: li a7, 12 -; ZIP-NEXT: mul a6, a6, a7 -; ZIP-NEXT: add a6, sp, a6 -; ZIP-NEXT: addi a6, a6, 64 +; ZIP-NEXT: vl1r.v v15, (a4) +; ZIP-NEXT: add a1, a5, a0 +; ZIP-NEXT: vl1r.v v8, (a2) +; ZIP-NEXT: vl1r.v v9, (a5) +; ZIP-NEXT: add a4, a4, a0 +; ZIP-NEXT: add a2, a1, a0 +; ZIP-NEXT: vl1r.v v10, (a1) +; ZIP-NEXT: add a1, a4, a0 +; ZIP-NEXT: vl1r.v v16, (a4) +; ZIP-NEXT: add a4, a2, a0 +; ZIP-NEXT: vl1r.v v11, (a2) +; ZIP-NEXT: add a2, a1, a0 ; ZIP-NEXT: vl1r.v v17, (a1) -; ZIP-NEXT: vl1r.v v10, (a4) -; ZIP-NEXT: vl1r.v v11, (a5) -; ZIP-NEXT: vl1r.v v8, (a0) -; ZIP-NEXT: vl1r.v v9, (a3) -; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a6, a2 -; ZIP-NEXT: vs4r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a6) -; ZIP-NEXT: vl8r.v v16, (a2) -; ZIP-NEXT: vl8r.v v8, (a6) +; ZIP-NEXT: vl1r.v 
v12, (a4) +; ZIP-NEXT: add a4, a4, a0 +; ZIP-NEXT: vl1r.v v18, (a2) +; ZIP-NEXT: add a2, a2, a0 +; ZIP-NEXT: vl1r.v v19, (a2) +; ZIP-NEXT: vl1r.v v13, (a4) +; ZIP-NEXT: slli a0, a0, 3 +; ZIP-NEXT: add a0, a3, a0 +; ZIP-NEXT: vs4r.v v16, (a0) +; ZIP-NEXT: vs8r.v v8, (a3) +; ZIP-NEXT: vl8r.v v16, (a0) +; ZIP-NEXT: vl8r.v v8, (a3) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -2855,16 +2845,16 @@ define @vector_interleave_nxv48i8_nxv8i8( %a ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg6e8.v v8, (a0) ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg6e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v9, (a2) ; CHECK-NEXT: vl1r.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 ; CHECK-NEXT: vl1r.v v11, (a3) ; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1r.v v8, (a0) -; CHECK-NEXT: vl1r.v v9, (a2) ; CHECK-NEXT: vl1r.v v12, (a3) ; CHECK-NEXT: add a1, a3, a1 ; CHECK-NEXT: vl1r.v v13, (a1) @@ -2884,16 +2874,16 @@ define @vector_interleave_nxv48i8_nxv8i8( %a ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg6e8.v v8, (a0) ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v9, (a2) ; ZVBB-NEXT: vl1r.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 ; ZVBB-NEXT: vl1r.v v11, (a3) ; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1r.v v8, (a0) -; ZVBB-NEXT: vl1r.v v9, (a2) ; ZVBB-NEXT: vl1r.v v12, (a3) ; ZVBB-NEXT: add a1, a3, a1 ; ZVBB-NEXT: vl1r.v v13, (a1) @@ -2921,61 +2911,59 @@ define @vector_interleave_nxv24i32_nxv4i32( @vector_interleave_nxv24i32_nxv4i32( @vector_interleave_nxv24i32_nxv4i32( @vector_interleave_nxv24i32_nxv4i32( @vector_interleave_nxv24i32_nxv4i32( @vector_interleave_nxv12i64_nxv2i64( @vector_interleave_nxv12i64_nxv2i64( @vector_interleave_nxv12i64_nxv2i64( @vector_interleave_nxv12i64_nxv2i64( @vector_interleave_nxv12i64_nxv2i64( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv128i1_nxv16i1( @vector_interleave_nxv128i1_nxv16i1( @vector_interleave_nxv128i8_nxv16i8( @vector_interleave_nxv128i8_nxv16i8( @vector_interleave_nxv64i16_nxv8i16( @vector_interleave_nxv64i16_nxv8i16( @vector_interleave_nxv32i32_nxv4i32( @vector_interleave_nxv32i32_nxv4i32( @vector_interleave_nxv16i64_nxv2i64( @vector_interleave_nxv16i64_nxv2i64( @vector_interleave_nxv64bf16_nxv32bf16( @vector_interleave_nxv64f16_nxv32f16( 
[hunks for the remaining @vector_interleave_* f16/bf16/f32/f64 test functions omitted; only their hunk headers remain]
@vector_interleave_nxv12f32_nxv2f32( @vector_interleave_nxv24f32_nxv4f32( @vector_interleave_nxv24f32_nxv4f32( @vector_interleave_nxv24f32_nxv4f32( @vector_interleave_nxv24f32_nxv4f32( @vector_interleave_nxv24f32_nxv4f32( @vector_interleave_nxv6f64_nxv1f64( @vector_interleave_nxv6f64_nxv1f64( @vector_interleave_nxv12f64_nxv2f64( @vector_interleave_nxv12f64_nxv2f64( @vector_interleave_nxv12f64_nxv2f64( @vector_interleave_nxv12f64_nxv2f64( @vector_interleave_nxv12f64_nxv2f64( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv28f16_nxv4f16( @vector_interleave_nxv28f16_nxv4f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv28bf16_nxv4bf16( @vector_interleave_nxv28bf16_nxv4bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv14f32_nxv2f32( @vector_interleave_nxv14f32_nxv2f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv7f64_nxv1f64( @vector_interleave_nxv7f64_nxv1f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv16f16_nxv2f16( @vector_interleave_nxv16f16_nxv2f16( @vector_interleave_nxv32f16_nxv4f16( @vector_interleave_nxv32f16_nxv4f16( @vector_interleave_nxv64f16_nxv8f16( @vector_interleave_nxv64f16_nxv8f16( @vector_interleave_nxv16bf16_nxv2bf16( @vector_interleave_nxv16bf16_nxv2bf16( @vector_interleave_nxv32bf16_nxv4bf16( @vector_interleave_nxv32bf16_nxv4bf16( @vector_interleave_nxv64bf16_nxv8bf16( @vector_interleave_nxv64bf16_nxv8bf16( @vector_interleave_nxv8f32_nxv1f32( @vector_interleave_nxv8f32_nxv1f32( @vector_interleave_nxv16f32_nxv2f32( @vector_interleave_nxv16f32_nxv2f32( @vector_interleave_nxv32f32_nxv4f32( @vector_interleave_nxv32f32_nxv4f32( @vector_interleave_nxv8f64_nxv1f64( @vector_interleave_nxv8f64_nxv1f64( @vector_interleave_nxv16f64_nxv2f64( @vector_interleave_nxv16f64_nxv2f64( @vfadd_vv_nxv32bf16( %va, @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: 
vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vfadd.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -533,11 +529,9 @@ define @vfadd_vv_nxv32f16( %va, @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfadd.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll index 061b2b0c5ab37..58db2f2617764 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll @@ -208,11 +208,9 @@ define @vfadd_vv_nxv32bf16( %va, @vfadd_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: vfadd.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -536,11 +532,9 @@ define 
@vfadd_vv_nxv32f16( %va, @vfadd_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfadd.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfadd.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 818b882a402ac..630d62a64d2f9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -221,16 +221,14 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v16, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vfdiv.vv v16, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 ; 
CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v0, v8 +; CHECK-NEXT: vfdiv.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -583,16 +569,14 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8 +; ZVFHMIN-NEXT: vfdiv.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index 60b49874ceaea..cc3b953746d3d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -200,16 +200,14 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfdiv_vf_nxv32bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; 
CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfdiv.vv v16, v16, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v24, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfdiv.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -512,16 +524,14 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: sub sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfdiv.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll index c06836f129005..e219ba1913872 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll @@ -201,11 
+201,9 @@ define @vfmax_nxv32bf16_vv( %a, @vfmax_nxv32bf16_vf( %a, bfl ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v16, v24, v16 +; CHECK-NEXT: vfmax.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -518,11 +514,9 @@ define @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vf( %a, half %b) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmax.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmax.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfmax.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll index 98ccbf03e1841..706212ff2feb5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll @@ -201,11 +201,9 @@ define @vfmin_nxv32bf16_vv( %a, @vfmin_nxv32bf16_vf( %a, bfl ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, 
zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v16, v24, v16 +; CHECK-NEXT: vfmin.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -518,11 +514,9 @@ define @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vf( %a, half %b) ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmin.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfmin.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll index 990d3d4e227df..123fb8bc6f757 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll @@ -206,11 +206,9 @@ define @vfmul_vv_nxv32bf16( %va, @vfmul_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded 
Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v16, v24, v16 +; CHECK-NEXT: vfmul.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -533,11 +529,9 @@ define @vfmul_vv_nxv32f16( %va, @vfmul_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmul.vv v16, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v0, v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfmul.vv v24, v8, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll index f9373400295df..abea4a046d829 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll @@ -208,11 +208,9 @@ define @vfmul_vv_nxv32bf16( %va, @vfmul_vf_nxv32bf16( %va, bf ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v16, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v0, v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmul.vv v16, v24, v16 +; CHECK-NEXT: vfmul.vv v24, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; 
CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
@@ -536,11 +532,9 @@ define @vfmul_vv_nxv32f16( %va, @vfmul_vf_nxv32f16( %va, half %b
 ; ZVFHMIN-NEXT: fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a1, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vmv.v.x v8, a0
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfmul.vv v16, v16, v0
 ; ZVFHMIN-NEXT: addi a0, sp, 16
 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v0, v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfmul.vv v16, v24, v16
+; ZVFHMIN-NEXT: vfmul.vv v24, v8, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: csrr a0, vlenb
 ; ZVFHMIN-NEXT: slli a0, a0, 3
 ; ZVFHMIN-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index 111fa368ac155..deabf42e9a953 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -673,12 +673,10 @@ define @vfptosi_nxv32bf16_nxv32i8( %va)
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v16
+; CHECK-NEXT: vfncvt.rtz.x.f.w v16, v24
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v24
-; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v12, 0
+; CHECK-NEXT: vnsrl.wi v10, v16, 0
 ; CHECK-NEXT: ret
 %evec = fptosi %va to
 ret %evec
@@ -691,12 +689,10 @@ define @vfptoui_nxv32bf16_nxv32i8( %va)
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v16
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v16, v24
 ; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v24
-; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v12, 0
+; CHECK-NEXT: vnsrl.wi v10, v16, 0
 ; CHECK-NEXT: ret
 %evec = fptoui %va to
 ret %evec
@@ -707,9 +703,9 @@ define @vfptosi_nxv32bf16_nxv32i16( %v
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v16
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v16
+; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = fptosi %va to
 ret %evec
@@ -720,9 +716,9 @@ define @vfptoui_nxv32bf16_nxv32i16( %v
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v16
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v16
+; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = fptoui %va to
 ret %evec
@@ -1706,12 +1702,10 @@ define @vfptosi_nxv32f16_nxv32i8( %va) {
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v16, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma
 ; ZVFHMIN-NEXT: vnsrl.wi v8, v12, 0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; ZVFHMIN-NEXT: vnsrl.wi v10, v12, 0
+; ZVFHMIN-NEXT: vnsrl.wi v10, v16, 0
 ; ZVFHMIN-NEXT: ret
 %evec = fptosi %va to
 ret %evec
@@ -1731,12 +1725,10 @@ define @vfptoui_nxv32f16_nxv32i8( %va) {
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v16, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma
 ; ZVFHMIN-NEXT: vnsrl.wi v8, v12, 0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v24
-; ZVFHMIN-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; ZVFHMIN-NEXT: vnsrl.wi v10, v12, 0
+; ZVFHMIN-NEXT: vnsrl.wi v10, v16, 0
 ; ZVFHMIN-NEXT: ret
 %evec = fptoui %va to
 ret %evec
@@ -1753,9 +1745,9 @@ define @vfptosi_nxv32f16_nxv32i16( %va)
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = fptosi %va to
 ret %evec
@@ -1772,9 +1764,9 @@ define @vfptoui_nxv32f16_nxv32i16( %va)
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = fptoui %va to
 ret %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
index eeb5f3bc984d3..9a9d833acefb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
@@ -92,11 +92,9 @@ define @vfsqrt_nxv32bf16( %v) stric
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsqrt.v v16, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
 ; CHECK-NEXT: vfsqrt.v v24, v24
 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
@@ -229,11 +227,9 @@ define @vfsqrt_nxv32f16( %v) strictfp {
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsqrt.v v16, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT: vfsqrt.v v24, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
index 6d7662db2b157..619897b10b6df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
@@ -87,11 +87,9 @@ define @vfsqrt_nxv32bf16( %v) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsqrt.v v16, v16
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsqrt.v v16, v16
 ; CHECK-NEXT: vfsqrt.v v24, v24
 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
@@ -224,11 +222,9 @@ define @vfsqrt_nxv32f16( %v) {
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsqrt.v v16, v16
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT: vfsqrt.v v24, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
index cd8f890251c77..cc0109c09b34e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
@@ -225,11 +225,9 @@ define @vfsub_vv_nxv32bf16( %va, @vfsub_vf_nxv32bf16( %va, bf
 ; CHECK-NEXT: fmv.x.h a0, fa0
 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v0
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v0, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vfsub.vv v24, v8, v24
 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
@@ -577,11 +573,9 @@ define @vfsub_vv_nxv32f16( %va, @vfsub_vf_nxv32f16( %va, half %b
 ; ZVFHMIN-NEXT: fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a1, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vmv.v.x v8, a0
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfsub.vv v16, v16, v0
 ; ZVFHMIN-NEXT: addi a0, sp, 16
 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v0, v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16
+; ZVFHMIN-NEXT: vfsub.vv v24, v8, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: csrr a0, vlenb
 ; ZVFHMIN-NEXT: slli a0, a0, 3
 ; ZVFHMIN-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
index 550d8aad3ee20..df3b9971cc8fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
@@ -208,11 +208,9 @@ define @vfsub_vv_nxv32bf16( %va, @vfsub_vf_nxv32bf16( %va, bf
 ; CHECK-NEXT: fmv.x.h a0, fa0
 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v16, v16, v0
 ; CHECK-NEXT: addi a0, sp, 16
 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v0, v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfsub.vv v16, v24, v16
+; CHECK-NEXT: vfsub.vv v24, v8, v24
 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: slli a0, a0, 3
 ; CHECK-NEXT: add sp, sp, a0
@@ -536,11 +532,9 @@ define @vfsub_vv_nxv32f16( %va, @vfsub_vf_nxv32f16( %va, half %b
 ; ZVFHMIN-NEXT: fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a1, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT: vmv.v.x v8, a0
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vfsub.vv v16, v16, v0
 ; ZVFHMIN-NEXT: addi a0, sp, 16
 ; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v0, v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfsub.vv v16, v24, v16
+; ZVFHMIN-NEXT: vfsub.vv v24, v8, v24
 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: csrr a0, vlenb
 ; ZVFHMIN-NEXT: slli a0, a0, 3
 ; ZVFHMIN-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
index 95b1c35d48bb3..d900452fe6455 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
@@ -339,11 +339,11 @@ define @vsitofp_nxv32i8_nxv32bf16( %va)
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vsext.vf2 v12, v8
-; CHECK-NEXT: vsext.vf2 v24, v10
+; CHECK-NEXT: vsext.vf2 v4, v10
 ; CHECK-NEXT: vfwcvt.f.x.v v16, v12
+; CHECK-NEXT: vfwcvt.f.x.v v24, v4
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: vfwcvt.f.x.v v16, v24
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = sitofp %va to
 ret %evec
@@ -354,11 +354,11 @@ define @vuitofp_nxv32i8_nxv32bf16( %va)
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vzext.vf2 v24, v10
+; CHECK-NEXT: vzext.vf2 v4, v10
 ; CHECK-NEXT: vfwcvt.f.xu.v v16, v12
+; CHECK-NEXT: vfwcvt.f.xu.v v24, v4
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: vfwcvt.f.xu.v v16, v24
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = uitofp %va to
 ret %evec
@@ -479,9 +479,9 @@ define @vsitofp_nxv32i16_nxv32bf16( %v
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvt.f.x.v v16, v8
+; CHECK-NEXT: vfwcvt.f.x.v v24, v12
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: vfwcvt.f.x.v v16, v12
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = sitofp %va to
 ret %evec
@@ -492,9 +492,9 @@ define @vuitofp_nxv32i16_nxv32bf16( %v
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT: vfwcvt.f.xu.v v16, v8
+; CHECK-NEXT: vfwcvt.f.xu.v v24, v12
 ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT: vfwcvt.f.xu.v v16, v12
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT: ret
 %evec = uitofp %va to
 ret %evec
@@ -1646,11 +1646,11 @@ define @vsitofp_nxv32i8_nxv32f16( %va) {
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vsext.vf2 v12, v8
-; ZVFHMIN-NEXT: vsext.vf2 v24, v10
+; ZVFHMIN-NEXT: vsext.vf2 v4, v10
 ; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.x.v v24, v4
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v24
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = sitofp %va to
 ret %evec
@@ -1668,11 +1668,11 @@ define @vuitofp_nxv32i8_nxv32f16( %va) {
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vzext.vf2 v12, v8
-; ZVFHMIN-NEXT: vzext.vf2 v24, v10
+; ZVFHMIN-NEXT: vzext.vf2 v4, v10
 ; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.xu.v v24, v4
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v24
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = uitofp %va to
 ret %evec
@@ -2057,9 +2057,9 @@ define @vsitofp_nxv32i16_nxv32f16( %va)
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.x.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.x.v v16, v12
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = sitofp %va to
 ret %evec
@@ -2076,9 +2076,9 @@ define @vuitofp_nxv32i16_nxv32f16( %va)
 ; ZVFHMIN: # %bb.0:
 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v8
+; ZVFHMIN-NEXT: vfwcvt.f.xu.v v24, v12
 ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT: vfwcvt.f.xu.v v16, v12
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT: ret
 %evec = uitofp %va to
 ret %evec
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index d7d767e600db5..5d390473ec971 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -218,21 +218,23 @@ define void @store_factor3_v2( %v0, %v1, %v0, %v1, ptr %ptr, i32 zeroext %evl) {
 ; RV32-LABEL: store_factor4_v2:
 ; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vmv1r.v v11, v9
 ; RV32-NEXT: slli a1, a1, 3
 ; RV32-NEXT: srli a1, a1, 2
 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; RV32-NEXT: vmv1r.v v10, v8
-; RV32-NEXT: vmv1r.v v11, v9
 ; RV32-NEXT: vsseg4e32.v v8, (a0)
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: store_factor4_v2:
 ; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vmv1r.v v11, v9
 ; RV64-NEXT: slli a1, a1, 35
 ; RV64-NEXT: srli a1, a1, 34
 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; RV64-NEXT: vmv1r.v v10, v8
-; RV64-NEXT: vmv1r.v v11, v9
 ; RV64-NEXT: vsseg4e32.v v8, (a0)
 ; RV64-NEXT: ret
 %rvl = mul nuw i32 %evl, 8