diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 329b42d621cee..01d5f01f535e4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5155,6 +5155,28 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
 
+// Matches a subset of compress masks with a contiguous prefix of output
+// elements. This could be extended to allow gaps by deciding which
+// source elements to spuriously demand.
+static bool isCompressMask(ArrayRef<int> Mask) {
+  int Last = -1;
+  bool SawUndef = false;
+  for (int i = 0; i < Mask.size(); i++) {
+    if (Mask[i] == -1) {
+      SawUndef = true;
+      continue;
+    }
+    if (SawUndef)
+      return false;
+    if (i > Mask[i])
+      return false;
+    if (Mask[i] <= Last)
+      return false;
+    Last = Mask[i];
+  }
+  return true;
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -5372,6 +5394,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
       return V;
 
+    // Can we generate a vcompress instead of a vrgather? These scale better
+    // at high LMUL, at the cost of not being able to fold a following select
+    // into them. The mask constants are also smaller than the index vector
+    // constants, and thus easier to materialize.
+    if (isCompressMask(Mask)) {
+      SmallVector<SDValue> MaskVals(NumElts,
+                                    DAG.getConstant(false, DL, XLenVT));
+      for (auto Idx : Mask) {
+        if (Idx == -1)
+          break;
+        assert(Idx >= 0 && (unsigned)Idx < NumElts);
+        MaskVals[Idx] = DAG.getConstant(true, DL, XLenVT);
+      }
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+      SDValue CompressMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+      return DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, V1, CompressMask,
+                         DAG.getUNDEF(VT));
+    }
+
     if (VT.getScalarSizeInBits() == 8 &&
         any_of(Mask, [&](const auto &Idx) { return Idx > 255; })) {
       // On such a vector we're unable to use i8 as the index type.
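[Reviewer note, not part of the patch] The predicate above accepts exactly the shuffles a single vcompress.vm can implement on the single-source path (only V1 is compressed, as the assert in the lowering reflects): the defined outputs form a contiguous prefix, output i takes a source element Mask[i] >= i, the chosen source indices are strictly increasing, and an undef element ends the prefix. The standalone C++ sketch below mirrors that logic outside of SelectionDAG so it can be compiled and run directly; the name isCompressMaskSketch and the example masks are illustrative only and are not taken from the patch.

#include <cassert>
#include <vector>

// Mirrors isCompressMask above: -1 means undef; once an undef is seen no
// further defined elements may follow, and defined elements must be strictly
// increasing and never come from before their output position.
static bool isCompressMaskSketch(const std::vector<int> &Mask) {
  int Last = -1;
  bool SawUndef = false;
  for (int i = 0; i < (int)Mask.size(); i++) {
    if (Mask[i] == -1) {
      SawUndef = true;
      continue;
    }
    if (SawUndef)
      return false;
    if (i > Mask[i])
      return false;
    if (Mask[i] <= Last)
      return false;
    Last = Mask[i];
  }
  return true;
}

int main() {
  // Keep source lanes 0,2,4,5,7; undef tail is allowed.
  assert(isCompressMaskSketch({0, 2, 4, 5, 7, -1, -1, -1}));
  // A gap (undef followed by a defined element) is rejected.
  assert(!isCompressMaskSketch({0, -1, 3}));
  // Elements must come from at-or-after their output slot.
  assert(!isCompressMaskSketch({1, 0, 3, 2}));
  // Source indices must be strictly increasing (no repeats).
  assert(!isCompressMaskSketch({0, 2, 2, 5}));
  return 0;
}

When the predicate matches, the lowering simply sets bit Mask[i] of an i1 build_vector for each defined output and emits ISD::VECTOR_COMPRESS, which selects to vcompress.vm. For example, keeping source lanes {0,2,4,5,7} corresponds to the 0b10110101 = 181 mask constant materialized with li/vmv.s.x in shuffle_compress_singlesrc_e8 in the test updates below, replacing the previous constant-pool index vector and vrgather.vv.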
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index b0f8bc9dcc6bd..e82891f90d85e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -40,16 +40,16 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, ; CHECK-LABEL: hang_when_merging_stores_after_legalization: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vmv.v.i v12, -14 +; CHECK-NEXT: vid.v v14 ; CHECK-NEXT: li a0, 7 +; CHECK-NEXT: vmadd.vx v14, a0, v12 +; CHECK-NEXT: li a0, 129 +; CHECK-NEXT: vmv.s.x v15, a0 ; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vmul.vx v14, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vadd.vi v8, v14, -14 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t +; CHECK-NEXT: vcompress.vm v12, v8, v15 +; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t ; CHECK-NEXT: vmv1r.v v8, v12 ; CHECK-NEXT: ret %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index c803b15913bb3..0db45ae71bc8a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -138,17 +138,17 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) -; CHECK-NEXT: li a0, 3 -; CHECK-NEXT: vmul.vx v12, v10, a0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 9 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vcompress.vm v12, v8, v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index dbfe7bb51dbff..5b01eae1ba3c0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -113,14 +113,12 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a0, 3 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v9, 9 ; CHECK-NEXT: vmv.v.i v0, 3 -; CHECK-NEXT: vmul.vx v10, v9, a0 -; CHECK-NEXT: vmv.v.i v9, 5 -; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vcompress.vm v10, v8, v9 +; CHECK-NEXT: vmv.v.i v8, 5 +; 
CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s @@ -723,21 +721,22 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { ; CHECK-LABEL: shuffle_v64i8_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: lui a0, 4112 ; CHECK-NEXT: li a1, 240 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: lui a1, 98561 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vsll.vi v14, v12, 3 -; CHECK-NEXT: vrgather.vv v12, v8, v14 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: addi a1, a1, -2048 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: addi a0, a0, 257 +; CHECK-NEXT: vmv.s.x v14, a0 +; CHECK-NEXT: lui a0, 98561 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vcompress.vm v12, v8, v14 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: addi a0, a0, -2048 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; CHECK-NEXT: vmv.v.x v10, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu ; CHECK-NEXT: vrgather.vv v12, v8, v10, v0.t ; CHECK-NEXT: vmv1r.v v8, v12 ; CHECK-NEXT: ret @@ -748,11 +747,10 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) { define <8 x i8> @shuffle_compress_singlesrc_e8(<8 x i8> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI49_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI49_0) +; CHECK-NEXT: li a0, 181 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vcompress.vm v9, v8, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %out = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -762,11 +760,10 @@ define <8 x i8> @shuffle_compress_singlesrc_e8(<8 x i8> %v) { define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI50_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI50_0) +; CHECK-NEXT: li a0, 181 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vcompress.vm v9, v8, v10 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %out = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> @@ -776,11 +773,10 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) { define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI51_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI51_0) +; CHECK-NEXT: li a0, 115 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vcompress.vm v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> @@ -790,11 +786,10 @@ define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) { define <8 x i64> @shuffle_compress_singlesrc_e64(<8 x i64> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI52_0) -; CHECK-NEXT: addi a0, 
a0, %lo(.LCPI52_0) +; CHECK-NEXT: li a0, 181 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.s.x v16, a0 +; CHECK-NEXT: vcompress.vm v12, v8, v16 ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %out = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index fa1377406d697..651674ee9a502 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,463 +183,406 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 81 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 6 +; RV32-NEXT: add a2, a3, a2 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 81 * vlenb -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: addi a4, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb +; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: lui a5, 12 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a4) +; RV32-NEXT: lui a5, 12291 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a6, 41 +; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) +; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vle32.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a6, 57 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: lui a4, %hi(.LCPI8_0) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_0) -; RV32-NEXT: vmv.s.x v1, a5 -; RV32-NEXT: lui a5, %hi(.LCPI8_1) -; RV32-NEXT: addi a5, a5, %lo(.LCPI8_1) -; RV32-NEXT: vle16.v v4, (a4) -; RV32-NEXT: lui a4, 1 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v12, v16, 4 -; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 37 -; RV32-NEXT: mul a6, a6, a7 -; RV32-NEXT: add a6, sp, a6 -; RV32-NEXT: addi a6, a6, 16 -; RV32-NEXT: vs4r.v v12, (a6) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 -; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 45 -; RV32-NEXT: mul a6, a6, a7 -; RV32-NEXT: add a6, sp, a6 -; RV32-NEXT: addi a6, a6, 16 -; RV32-NEXT: vs8r.v v16, (a6) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v12, v16, 10, v0.t -; RV32-NEXT: vmv.v.v v28, v12 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v24, (a5) -; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a5, a1, 6 -; RV32-NEXT: add a1, a5, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: 
vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, a1, -64 ; RV32-NEXT: vle32.v v16, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a4, -64 +; RV32-NEXT: vmv.s.x v3, a5 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v4 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v8, v24, v0.t -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v28, v16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vcompress.vm v8, v24, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: lui a1, 12 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 49 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vslideup.vi v12, v16, 4 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a4, a3, 4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vslideup.vi v12, v24, 10, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV32-NEXT: lui a3, %hi(.LCPI8_3) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_3) -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: lui a1, 49164 +; RV32-NEXT: lui a3, %hi(.LCPI8_1) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_1) +; RV32-NEXT: vsetvli zero, a2, e32, m8, 
ta, mu +; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: addi a1, a1, 12 +; RV32-NEXT: vmv.s.x v20, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v2, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v16, v12 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vcompress.vm v8, v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v4, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v12, v16, 2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v24, 8, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v24, v2 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 196656 +; RV32-NEXT: lui a3, %hi(.LCPI8_2) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_2) +; RV32-NEXT: li a4, 960 +; RV32-NEXT: lui a5, %hi(.LCPI8_3) +; RV32-NEXT: addi a5, a5, %lo(.LCPI8_3) +; RV32-NEXT: addi a1, a1, 48 +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v8, (a5) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a4, a3, 3 +; RV32-NEXT: add a3, a4, 
a3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v22, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v24, 6, v0.t -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_5) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_5) -; RV32-NEXT: lui a3, %hi(.LCPI8_6) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v28, a1 -; RV32-NEXT: vrgatherei16.vv v8, v16, v24 -; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vcompress.vm v8, v24, v22 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) -; RV32-NEXT: lui a3, %hi(.LCPI8_8) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v12, (a3) -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 13 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: slli a3, a1, 3 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v20, v16, v8 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vrgatherei16.vv v12, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v24, v8 -; RV32-NEXT: vslideup.vi v20, v8, 4, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: 
vslideup.vi v12, v16, 6, v0.t +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 +; RV32-NEXT: slli a3, a1, 3 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 786624 +; RV32-NEXT: lui a3, %hi(.LCPI8_4) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_4) +; RV32-NEXT: lui a4, %hi(.LCPI8_5) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_5) +; RV32-NEXT: addi a1, a1, 192 +; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v12, (a4) +; RV32-NEXT: vmv.s.x v14, a1 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vcompress.vm v16, v24, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 -; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_10) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_10) +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: lui a1, 15 -; RV32-NEXT: vmv.s.x v3, a1 +; RV32-NEXT: vrgatherei16.vv v4, v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v16, 6 -; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 57 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_11) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11) -; RV32-NEXT: lui a3, %hi(.LCPI8_12) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_12) +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v4, v8, 4, v0.t +; RV32-NEXT: lui a1, 768 +; RV32-NEXT: lui a3, %hi(.LCPI8_6) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_6) +; RV32-NEXT: li a4, 1008 +; RV32-NEXT: addi a1, a1, 768 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, 
mu -; RV32-NEXT: vle16.v v28, (a1) -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vmv.s.x v12, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 41 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28 +; RV32-NEXT: vcompress.vm v24, v16, v12 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 73 +; RV32-NEXT: li a3, 57 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 2 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 25 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_13) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_13) -; RV32-NEXT: lui a3, %hi(.LCPI8_14) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_14) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI8_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_15) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_15) -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v28, (a3) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: lui a1, 15 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v20, v16, 6 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a3, a1, 5 ; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t +; RV32-NEXT: lui a1, 3073 +; RV32-NEXT: lui a3, %hi(.LCPI8_8) +; RV32-NEXT: addi a3, a3, %lo(.LCPI8_8) +; RV32-NEXT: lui a4, %hi(.LCPI8_9) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_9) +; RV32-NEXT: addi a1, a1, -1024 +; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV32-NEXT: vle16.v v16, (a3) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v2, (a4) +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, 
m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 6 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v28 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vcompress.vm v8, v24, v0 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 73 +; RV32-NEXT: li a2, 57 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 4 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 13 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v28, v0 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vrgatherei16.vv v12, v24, v2, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 57 -; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v4, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 25 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v24, v0 -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v16, (a1) +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v28, (a1) +; RV32-NEXT: vse32.v v4, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a3, a2, 3 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 13 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded 
Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 41 +; RV32-NEXT: li a2, 21 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 81 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a1, a0, 6 +; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 ; RV32-NEXT: addi sp, sp, 16 @@ -651,366 +594,457 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 6 +; RV64-NEXT: li a3, 67 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc3, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 67 * vlenb ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 48 +; RV64-NEXT: li a4, 59 ; RV64-NEXT: mul a3, a3, a4 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV64-NEXT: addi a1, a1, 256 -; RV64-NEXT: vle64.v v16, (a1) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 20 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 128 -; RV64-NEXT: vid.v v10 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: li a1, 6 -; RV64-NEXT: vmul.vx v2, v10, a1 -; RV64-NEXT: li a1, 56 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v12, v16, 4 -; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v16, 8 +; RV64-NEXT: li a3, 128 +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vid.v v8 +; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: li a4, 36 +; RV64-NEXT: li a4, 30 ; RV64-NEXT: mul a3, a3, a4 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: li a3, 6 +; RV64-NEXT: vmul.vx v6, v8, a3 +; RV64-NEXT: li a3, 56 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vrgather.vi v8, v24, 4 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 22 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma +; RV64-NEXT: vslidedown.vi v16, v24, 8 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 39 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t +; RV64-NEXT: vrgather.vi v8, v16, 2, v0.t +; RV64-NEXT: vmv.v.v v20, v8 +; RV64-NEXT: vmv.s.x v8, a3 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: li a4, 55 +; RV64-NEXT: mul a3, a3, a4 +; RV64-NEXT: add a3, sp, a3 +; RV64-NEXT: addi a3, a3, 16 +; RV64-NEXT: vs1r.v v8, (a3) # Unknown-size Folded Spill +; RV64-NEXT: addi a3, a1, 65 ; 
RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v16, (a2) +; RV64-NEXT: vle64.v v8, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 56 +; RV64-NEXT: li a4, 47 +; RV64-NEXT: mul a2, a2, a4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v16, a3 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 35 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v7, a1 +; RV64-NEXT: vs1r.v v16, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v10, v2, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v16, v6, -16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv2r.v v18, v6 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 12 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs2r.v v6, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 59 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 35 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl1r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v16, v2 -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v16, v10, v0.t +; RV64-NEXT: vcompress.vm v24, v0, v16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 55 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v24 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 20 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v24 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 18 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 22 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload 
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 5 -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 36 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t -; RV64-NEXT: vmv.v.v v20, v12 +; RV64-NEXT: vrgather.vi v8, v24, 5 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 30 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 39 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v8, v24, 3, v0.t +; RV64-NEXT: vmv.v.v v20, v8 +; RV64-NEXT: lui a2, 2 +; RV64-NEXT: addi a2, a2, 130 +; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v16, v2, 1 -; RV64-NEXT: vadd.vi v18, v2, -15 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v16, v18, -15 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 59 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v8, v16 -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v8, v18, v0.t +; RV64-NEXT: vcompress.vm v24, v0, v8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 55 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 47 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v24, v8, v16, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v20, v24 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 12 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, 16 -; RV64-NEXT: addi a1, a1, 7 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addi a2, a2, 7 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v9, 6 -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 20 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.i v8, 6 +; RV64-NEXT: vmv.v.x v9, a2 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 22 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; 
RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vrgatherei16.vv v12, v16, v8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 55 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v12, v16, v9 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 44 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v12, v16, v10 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vrgather.vi v12, v16, 2 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 35 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v12, v16, 3 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 28 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v7, a1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 4 +; RV64-NEXT: li a3, 24 +; RV64-NEXT: addi a2, a2, 260 +; RV64-NEXT: vmv.s.x v0, a3 +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v24, a2 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 12 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl2r.v v2, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v10, v2, 2 -; RV64-NEXT: vadd.vi v4, v2, -14 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v6, v2, -14 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 59 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v16, v10 -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v24, v8, v4, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 20 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vcompress.vm v8, v16, v24 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 47 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, 
sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 22 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 30 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl1r.v v1, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 36 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 44 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 39 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 55 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v28, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 44 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 55 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v28, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 8 +; RV64-NEXT: addi a2, a2, 520 +; RV64-NEXT: vmv.s.x v7, a2 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v30, v2, 3 -; RV64-NEXT: vadd.vi v28, v2, -13 +; RV64-NEXT: vadd.vi v4, v2, -13 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 59 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v30 -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v16, v28, v0.t -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vcompress.vm v8, v24, v7 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vl1r.v v0, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: vmv4r.v v16, v24 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 39 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; 
RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v4, v24, 5, v0.t -; RV64-NEXT: lui a1, 96 -; RV64-NEXT: li a2, 192 -; RV64-NEXT: vmv.s.x v8, a2 +; RV64-NEXT: vrgather.vi v8, v16, 5, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: sub a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 96 +; RV64-NEXT: li a3, 192 +; RV64-NEXT: vmv.s.x v1, a3 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a1 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.x v8, a2 +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 35 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v24, v9, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: li a1, 28 -; RV64-NEXT: vmv.s.x v1, a1 +; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 35 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 1040 +; RV64-NEXT: li a3, 28 +; RV64-NEXT: vmv.s.x v20, a2 +; RV64-NEXT: vmv.s.x v0, a3 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 30 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v10, v2, 4 -; RV64-NEXT: vadd.vi v12, v2, -12 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v22, v2, -12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 59 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v10 -; RV64-NEXT: vmv1r.v v0, v1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t -; RV64-NEXT: lui a1, 112 -; RV64-NEXT: addi a1, a1, 1 +; RV64-NEXT: vcompress.vm v8, v24, v20 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 47 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v22, v0.t +; RV64-NEXT: lui a2, 112 +; RV64-NEXT: addi a2, a2, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a1 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 28 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; 
RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 36
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.x v12, a2
+; RV64-NEXT: vmv1r.v v0, v1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl4r.v v4, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT: vrgatherei16.vv v12, v24, v9, v0.t
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 28
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 44
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 20
-; RV64-NEXT: mul a1, a1, a2
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vrgatherei16.vv v4, v16, v12, v0.t
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 55
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 22
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v20, v8
+; RV64-NEXT: vmv.v.v v12, v16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 55
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs4r.v v12, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: addi a1, a1, -2016
+; RV64-NEXT: vmv.s.x v12, a1
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 44
+; RV64-NEXT: li a2, 59
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v20, v2, 5
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v24, v12
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vadd.vi v12, v2, -11
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 48
+; RV64-NEXT: li a2, 30
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT: vrgatherei16.vv v8, v24, v20
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vadd.vi v20, v2, -11
-; RV64-NEXT: vmv1r.v v0, v1
+; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 56
+; RV64-NEXT: li a2, 47
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t
-; RV64-NEXT: vmv4r.v v12, v4
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT: vmv.v.v v12, v0
+; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 5
+; RV64-NEXT: slli a2, a1, 5
+; RV64-NEXT: sub a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv.v.v v20, v16
+; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma
+; RV64-NEXT: vmv.v.v v12, v24
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: li a2, 28
+; RV64-NEXT: li a2, 35
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vmv.v.v v16, v8
-; RV64-NEXT: addi a1, a0, 320
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v16, (a1)
+; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vmv.v.v v20, v8
+; RV64-NEXT: vmv4r.v v8, v4
+; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: addi a1, a0, 256
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vse64.v v20, (a1)
+; RV64-NEXT: addi a1, a0, 320
+; RV64-NEXT: vse64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 192
; RV64-NEXT: vse64.v v12, (a1)
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 44
+; RV64-NEXT: li a3, 55
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
@@ -1018,20 +1052,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vse64.v v8, (a1)
; RV64-NEXT: addi a1, a0, 64
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: li a3, 12
+; RV64-NEXT: li a3, 14
; RV64-NEXT: mul a2, a2, a3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a1)
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
+; RV64-NEXT: li a2, 18
+; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 6
+; RV64-NEXT: li a1, 67
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: .cfi_def_cfa sp, 16
; RV64-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index f04faf5cd2c54..27e66690d1b1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -12,13 +12,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vid.v v9
+; CHECK-NEXT: vmv.v.i v9, -8
+; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 3
-; CHECK-NEXT: vmul.vx v9, v9, a0
+; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 73
+; CHECK-NEXT: vmv.s.x v9, a0
; CHECK-NEXT: li a0, 56
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v10, v9, -8
-; CHECK-NEXT: vrgather.vv v11, v8, v9
+; CHECK-NEXT: vcompress.vm v11, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
@@ -37,22 +39,20 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 1
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a0, 3
-; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 146
+; CHECK-NEXT: vmv.s.x v9, a0
; CHECK-NEXT: li a0, 24
-; CHECK-NEXT: vrgather.vv v9, v8, v10
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vsrl.vi v10, v8, 8
+; CHECK-NEXT: vsrl.vi v9, v8, 8
; CHECK-NEXT: vsll.vi v8, v8, 8
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vor.vv v8, v8, v10
+; CHECK-NEXT: vor.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
; CHECK-NEXT: vse8.v v8, (a1)
; CHECK-NEXT: ret
entry:
@@ -100,15 +100,15 @@ define void @deinterleave4_8_i8(ptr %in, ptr %out) {
; CHECK-NEXT: vmv.v.i v9, -9
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 34
; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vmacc.vx v9, a0, v10
-; CHECK-NEXT: vsll.vi v10, v10, 2
-; CHECK-NEXT: vadd.vi v10, v10, 1
-; CHECK-NEXT: vrgather.vv v11, v8, v10
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vcompress.vm v11, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vv v11, v8, v9, v0.t
+; CHECK-NEXT: vrgather.vv v11, v8, v10, v0.t
; CHECK-NEXT: vse8.v v11, (a1)
; CHECK-NEXT: ret
entry:
@@ -124,12 +124,14 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) {
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vid.v v9
+; CHECK-NEXT: vmv.v.i v9, -8
+; CHECK-NEXT: vid.v v10
; CHECK-NEXT: li a0, 5
+; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 33
; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vmul.vx v9, v9, a0
-; CHECK-NEXT: vadd.vi v10, v9, -8
-; CHECK-NEXT: vrgather.vv v11, v8, v9
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vcompress.vm v11, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
@@ -148,18 +150,16 @@ define void @deinterleave5_8_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 1
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a0, 5
-; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 66
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vrgather.vv v9, v8, v10
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v9, v8, 3, v0.t
-; CHECK-NEXT: vse8.v v9, (a1)
+; CHECK-NEXT: vrgather.vi v10, v8, 3, v0.t
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -173,12 +173,11 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: li a0, 6
+; CHECK-NEXT: li a0, 65
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vmul.vx v9, v9, a0
-; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
@@ -197,18 +196,16 @@ define void @deinterleave6_8_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 1
-; CHECK-NEXT: vid.v v10
-; CHECK-NEXT: li a0, 6
-; CHECK-NEXT: vmadd.vx v10, a0, v9
+; CHECK-NEXT: li a0, 130
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vrgather.vv v9, v8, v10
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT: vrgather.vi v9, v8, 5, v0.t
-; CHECK-NEXT: vse8.v v9, (a1)
+; CHECK-NEXT: vrgather.vi v10, v8, 5, v0.t
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -222,12 +219,11 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: li a0, 129
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vmul.vx v9, v9, a0
-; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vmv.s.x v9, a0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vcompress.vm v10, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 8f6240e112cdd..312520ae28374 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -473,17 +473,35 @@ entry:
; Can't match the m8 result type as the source would have to be m16 which
; isn't a legal type.
define void @vnsrl_0_i32_single_src_m8(ptr %in, ptr %out) {
-; CHECK-LABEL: vnsrl_0_i32_single_src_m8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 64
-; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vadd.vv v16, v16, v16
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
-; CHECK-NEXT: vse32.v v24, (a1)
-; CHECK-NEXT: ret
+; V-LABEL: vnsrl_0_i32_single_src_m8:
+; V: # %bb.0: # %entry
+; V-NEXT: li a2, 64
+; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: lui a0, 341
+; V-NEXT: addiw a0, a0, 1365
+; V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; V-NEXT: vmv.s.x v16, a0
+; V-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; V-NEXT: vcompress.vm v24, v8, v16
+; V-NEXT: vse32.v v24, (a1)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i32_single_src_m8:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: li a2, 64
+; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT: vmv.v.i v16, 0
+; ZVE32F-NEXT: lui a0, 341
+; ZVE32F-NEXT: addi a0, a0, 1365
+; ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; ZVE32F-NEXT: vmv.s.x v16, a0
+; ZVE32F-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; ZVE32F-NEXT: vcompress.vm v24, v8, v16
+; ZVE32F-NEXT: vse32.v v24, (a1)
+; ZVE32F-NEXT: ret
entry:
%0 = load <64 x i32>, ptr %in, align 4
%shuffle.i5 = shufflevector <64 x i32> %0, <64 x i32> poison, <64 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 54d2f3f68989b..41cf886c3ab75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -95,26 +95,25 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_v4i64_v8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: vmv.v.i v14, 5
+; CHECK-NEXT: vid.v v15
; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma
; CHECK-NEXT: vslidedown.vi v16, v8, 4
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vadd.vv v14, v12, v12
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v10, v14, -4
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v12, v16, v10, v0.t
+; CHECK-NEXT: vmv.v.i v18, 10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vcompress.vm v12, v8, v14
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v15, v14, 1
+; CHECK-NEXT: vadd.vv v14, v15, v15
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v15
+; CHECK-NEXT: vcompress.vm v10, v8, v18
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -3
+; CHECK-NEXT: vadd.vi v8, v14, -4
+; CHECK-NEXT: vadd.vi v9, v14, -3
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v10, v16, v8, v0.t
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
+; CHECK-NEXT: vrgatherei16.vv v10, v16, v9, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec)
@@ -124,26 +123,27 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_v8i64_v16i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vmv.v.i v0, -16
+; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v24, v8, 8
+; CHECK-NEXT: vmv.s.x v12, a0
+; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vadd.vv v20, v16, v16
+; CHECK-NEXT: vmv.s.x v21, a0
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
+; CHECK-NEXT: vcompress.vm v16, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v12, v20, -8
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v12, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v21, v20, 1
+; CHECK-NEXT: vadd.vi v22, v20, -8
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v21
+; CHECK-NEXT: vcompress.vm v12, v8, v21
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT: vadd.vi v8, v20, -7
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v16, v24, v22, v0.t
; CHECK-NEXT: vrgatherei16.vv v12, v24, v8, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
@@ -241,26 +241,25 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
; CHECK-LABEL: vector_deinterleave_v4f64_v8f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: vmv.v.i v14, 5
+; CHECK-NEXT: vid.v v15
; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma
; CHECK-NEXT: vslidedown.vi v16, v8, 4
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vadd.vv v14, v12, v12
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v10, v14, -4
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v12, v16, v10, v0.t
+; CHECK-NEXT: vmv.v.i v18, 10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vcompress.vm v12, v8, v14
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v15, v14, 1
+; CHECK-NEXT: vadd.vv v14, v15, v15
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v10, v8, v15
+; CHECK-NEXT: vcompress.vm v10, v8, v18
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -3
+; CHECK-NEXT: vadd.vi v8, v14, -4
+; CHECK-NEXT: vadd.vi v9, v14, -3
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v10, v16, v8, v0.t
+; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
+; CHECK-NEXT: vrgatherei16.vv v10, v16, v9, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec)