diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index b72efac0a4887..a903eaa6cbe54 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -235,10 +235,17 @@ void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
 bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);
 
+/// A variant of the previous method which is specialized for Scale=2, and
+/// treats -1 as undef and allows widening when a wider element is partially
+/// undef in the narrow form of the mask. This transformation discards
+/// information about which bytes in the original shuffle were undef.
+bool widenShuffleMaskElts(ArrayRef<int> M, SmallVectorImpl<int> &NewMask);
+
 /// Attempt to narrow/widen the \p Mask shuffle mask to the \p NumDstElts target
 /// width. Internally this will call narrowShuffleMaskElts/widenShuffleMaskElts.
-/// This will assert unless NumDstElts is a multiple of Mask.size (or vice-versa).
-/// Returns false on failure, and ScaledMask will be in an undefined state.
+/// This will assert unless NumDstElts is a multiple of Mask.size (or
+/// vice-versa). Returns false on failure, and ScaledMask will be in an
+/// undefined state.
 bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);
 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 6c2502ce21cca..b4b311cb727a1 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -479,6 +479,41 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
   return true;
 }
 
+bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
+                                SmallVectorImpl<int> &NewMask) {
+  unsigned NumElts = M.size();
+  if (NumElts % 2 != 0)
+    return false;
+
+  NewMask.clear();
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    int M0 = M[i];
+    int M1 = M[i + 1];
+
+    // If both elements are undef, new mask is undef too.
+    if (M0 == -1 && M1 == -1) {
+      NewMask.push_back(-1);
+      continue;
+    }
+
+    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
+      NewMask.push_back(M1 / 2);
+      continue;
+    }
+
+    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
+      NewMask.push_back(M0 / 2);
+      continue;
+    }
+
+    NewMask.clear();
+    return false;
+  }
+
+  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
+  return true;
+}
+
 bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &ScaledMask) {
   unsigned NumSrcElts = Mask.size();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3ad2905ce5207..797b150d70f83 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13721,44 +13721,6 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
 }
 
-// Return true if we can get a new shuffle mask by checking the parameter mask
-// array to test whether every two adjacent mask values are continuous and
-// starting from an even number.
-static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
-                           SmallVectorImpl<int> &NewMask) {
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts % 2 != 0)
-    return false;
-
-  NewMask.clear();
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    int M0 = M[i];
-    int M1 = M[i + 1];
-
-    // If both elements are undef, new mask is undef too.
-    if (M0 == -1 && M1 == -1) {
-      NewMask.push_back(-1);
-      continue;
-    }
-
-    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
-      NewMask.push_back(M1 / 2);
-      continue;
-    }
-
-    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
-      NewMask.push_back(M0 / 2);
-      continue;
-    }
-
-    NewMask.clear();
-    return false;
-  }
-
-  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
-  return true;
-}
-
 // Try to widen element type to get a new mask value for a better permutation
 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
 // UZP1/2, TRN1/2, REV, INS, etc.
@@ -13785,7 +13747,7 @@ static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
     return SDValue();
 
   SmallVector<int> NewMask;
-  if (isWideTypeMask(Mask, VT, NewMask)) {
+  if (widenShuffleMaskElts(Mask, NewMask)) {
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(ElementSize * 2)
                        : MVT::getIntegerVT(ElementSize * 2);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2eeca45ac414b..eafc8ebc0d609 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5261,6 +5261,39 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
   return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Try to widen element type to get a new mask value for a better permutation
+/// sequence. This doesn't try to inspect the widened mask for profitability;
+/// we speculate the widened form is equal or better. This has the effect of
+/// reducing mask constant sizes - allowing cheaper materialization sequences
+/// - and index sequence sizes - reducing register pressure and materialization
+/// cost, at the cost of (possibly) an extra VTYPE toggle.
+static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getVectorElementType();
+  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // Avoid wasted work leading to isTypeLegal check failing below
+  if (ElementSize > 32)
+    return SDValue();
+
+  SmallVector<int> NewMask;
+  if (!widenShuffleMaskElts(Mask, NewMask))
+    return SDValue();
+
+  MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(ElementSize * 2)
+                                      : MVT::getIntegerVT(ElementSize * 2);
+  MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NewVT))
+    return SDValue();
+  V0 = DAG.getBitcast(NewVT, V0);
+  V1 = DAG.getBitcast(NewVT, V1);
+  return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -5506,6 +5539,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
       return V;
 
+    // Before hitting generic lowering fallbacks, try to widen the mask
+    // to a wider SEW.
+    if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
+      return V;
+
    // Can we generate a vcompress instead of a vrgather? These scale better
    // at high LMUL, at the cost of not being able to fold a following select
    // into them.
The mask constants are also smaller than the index vector @@ -5615,6 +5653,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget)) return V; + // Before hitting generic lowering fallbacks, try to widen the mask + // to a wider SEW. + if (SDValue V = tryWidenMaskForShuffle(Op, DAG)) + return V; + // Try to pick a profitable operand order. bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 0bd8466669dc8..00138744a7bda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -603,10 +603,8 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, -32 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -704,8 +702,8 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: shuffle_v8i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, -13 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v0, 13 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -756,9 +754,9 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) { define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 115 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.s.x v12, a0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v12, 13 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vcompress.vm v10, v8, v12 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret @@ -832,26 +830,16 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) { } define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { -; RV32-LABEL: shuffle_spread3_singlesrc_e32: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI57_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI57_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v12, (a0) -; RV32-NEXT: vrgatherei16.vv v10, v8, v12 -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret -; -; RV64-LABEL: shuffle_spread3_singlesrc_e32: -; RV64: # %bb.0: -; RV64-NEXT: lui a0, 32769 -; RV64-NEXT: slli a0, a0, 21 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v8, v12 -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret +; CHECK-LABEL: shuffle_spread3_singlesrc_e32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: vslide1down.vx v12, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; 
CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 67d649902b022..0c7d7925edf39 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -183,498 +183,456 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 96 +; RV32-NEXT: li a3, 92 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xe0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 96 * vlenb -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: addi a4, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdc, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 92 * vlenb +; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: addi a4, a1, 128 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a5, 768 -; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI8_1) -; RV32-NEXT: addi a7, a7, %lo(.LCPI8_1) +; RV32-NEXT: lui a5, 12291 +; RV32-NEXT: lui a6, %hi(.LCPI8_0) +; RV32-NEXT: addi a6, a6, %lo(.LCPI8_0) +; RV32-NEXT: li a7, 768 +; RV32-NEXT: lui t0, 49164 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li t1, 76 +; RV32-NEXT: mul a1, a1, t1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li t0, 88 -; RV32-NEXT: mul a4, a4, t0 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: vle32.v v24, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vle16.v v6, (a6) +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: lui a1, %hi(.LCPI8_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v16, v6 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: li a5, 52 +; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: addi t0, t0, 12 +; RV32-NEXT: vmv.s.x v0, a7 +; RV32-NEXT: vmv.s.x v7, t0 +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 60 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a6, a6, 3 -; RV32-NEXT: vle16.v v4, (a7) -; 
RV32-NEXT: vmv.s.x v3, a6 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 52 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v24, v0 -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v24, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: li a1, 3 -; RV32-NEXT: lui a3, 49164 -; RV32-NEXT: lui a4, %hi(.LCPI8_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui a3, 196656 +; RV32-NEXT: lui a4, %hi(.LCPI8_2) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_2) ; RV32-NEXT: slli a1, a1, 10 -; RV32-NEXT: addi a3, a3, 12 +; RV32-NEXT: addi a3, a3, 48 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vle16.v v16, (a4) -; RV32-NEXT: vmv.s.x v20, a3 +; RV32-NEXT: vle16.v v14, (a4) +; RV32-NEXT: vmv.s.x v12, a3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v20 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: 
vmerge.vvm v24, v8, v24, v0 -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmerge.vvm v24, v24, v16, v0 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v24, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 3 -; RV32-NEXT: lui a3, 196656 +; RV32-NEXT: lui a3, 786624 ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: lui a5, 786624 +; RV32-NEXT: lui a5, 768 ; RV32-NEXT: li a6, 48 -; RV32-NEXT: lui a7, 768 +; RV32-NEXT: lui a7, 3073 ; RV32-NEXT: li t0, 192 ; RV32-NEXT: addi a1, a1, 3 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: addi a3, a3, 192 ; RV32-NEXT: addi a4, a4, 12 -; RV32-NEXT: addi a5, a5, 192 -; RV32-NEXT: addi a7, a7, 768 +; RV32-NEXT: addi a5, a5, 768 +; RV32-NEXT: addi a7, a7, -1024 ; RV32-NEXT: vmv.s.x v1, a6 -; RV32-NEXT: vmv.s.x v8, t0 -; RV32-NEXT: addi a6, sp, 16 -; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v12, t0 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vmv.s.x v14, a3 -; RV32-NEXT: vmv.s.x v7, a4 -; RV32-NEXT: vmv.s.x v3, a5 -; RV32-NEXT: vmv.s.x v2, a7 +; RV32-NEXT: vmv.s.x v3, a3 +; RV32-NEXT: vmv.s.x v2, a4 +; RV32-NEXT: vmv.s.x v13, a5 +; RV32-NEXT: vmv.s.x v14, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v24 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv4r.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v14 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v20, v8, v16, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: 
addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v24, v16, v0 -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vmerge.vvm v24, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v13 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v14 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 76 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 20 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 76 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: 
li a2, 84 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 88 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 68 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: lui a3, %hi(.LCPI8_5) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_5) -; RV32-NEXT: lui a4, 3073 -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: addi a3, a4, -1024 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v2, (a1) -; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: lui a1, 32 +; RV32-NEXT: addi a1, a1, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v24 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v20, v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a2, 84 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 48 +; RV32-NEXT: lui a2, %hi(.LCPI8_3) +; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3) +; RV32-NEXT: addi a1, a1, 5 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v28, (a2) +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.x v20, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 52 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v12, v2 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: vle16.v v14, (a1) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v8, v12, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 44 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, 
a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v24 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.v v8, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 52 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v16, v12 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v12, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v16 +; RV32-NEXT: lui a1, %hi(.LCPI8_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV32-NEXT: lui a2, %hi(.LCPI8_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v28, (a2) +; RV32-NEXT: lui a1, %hi(.LCPI8_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle16.v v30, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v0, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a2, 12 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v16, v14 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v12, v20, v28 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v12, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) -; RV32-NEXT: lui a3, %hi(.LCPI8_7) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_7) -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v20, (a3) -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v0, v30 +; RV32-NEXT: 
lui a1, %hi(.LCPI8_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: lui a2, %hi(.LCPI8_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v20, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI8_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v4, v16 -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 20 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v0 -; RV32-NEXT: lui a1, %hi(.LCPI8_8) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_8) -; RV32-NEXT: lui a3, %hi(.LCPI8_10) -; RV32-NEXT: addi a3, a3, %lo(.LCPI8_10) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_11) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_11) -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v14, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 52 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs2r.v v14, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a2) +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a2, 28 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v0, v4, v12 +; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v0, v20 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v0, v8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 +; RV32-NEXT: vmv.v.v v28, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v0, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 88 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 52 +; RV32-NEXT: li a2, 68 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v12, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, 
ma -; RV32-NEXT: vrgatherei16.vv v8, v16, v12 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v4, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: vmv.v.v v16, v0 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v0, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 6 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 52 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 84 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 96 +; RV32-NEXT: li a1, 92 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: .cfi_def_cfa sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll index f7647ff38c8a0..5fd7e47507f71 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll @@ -771,135 +771,43 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) { define <3 x i64> @reverse_v3i64(<3 x i64> %a) { -; RV32-LABEL: reverse_v3i64: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI44_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI44_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle16.v v12, (a0) -; RV32-NEXT: vrgatherei16.vv v10, v8, v12 -; RV32-NEXT: vmv.v.v v8, v10 -; RV32-NEXT: ret -; -; RV64-LABEL: reverse_v3i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64-NEXT: vid.v v10 -; RV64-NEXT: vrsub.vi v12, v10, 2 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-NEXT: vrgatherei16.vv v10, v8, v12 -; RV64-NEXT: vmv.v.v v8, v10 -; RV64-NEXT: ret -; -; RV32-ZVBB-LABEL: reverse_v3i64: -; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI44_0) -; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI44_0) -; RV32-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-ZVBB-NEXT: vle16.v v12, (a0) -; RV32-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12 -; RV32-ZVBB-NEXT: vmv.v.v v8, v10 -; RV32-ZVBB-NEXT: ret -; -; RV64-ZVBB-LABEL: reverse_v3i64: -; RV64-ZVBB: # %bb.0: -; RV64-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64-ZVBB-NEXT: vid.v v10 -; RV64-ZVBB-NEXT: vrsub.vi v12, v10, 2 -; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV64-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12 -; 
RV64-ZVBB-NEXT: vmv.v.v v8, v10 -; RV64-ZVBB-NEXT: ret +; CHECK-LABEL: reverse_v3i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrsub.vi v12, v10, 2 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret %res = shufflevector <3 x i64> %a, <3 x i64> poison, <3 x i32> ret <3 x i64> %res } define <6 x i64> @reverse_v6i64(<6 x i64> %a) { -; RV32-LABEL: reverse_v6i64: -; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI45_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v16, (a0) -; RV32-NEXT: vrgatherei16.vv v12, v8, v16 -; RV32-NEXT: vmv.v.v v8, v12 -; RV32-NEXT: ret -; -; RV64-LABEL: reverse_v6i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vid.v v12 -; RV64-NEXT: vrsub.vi v16, v12, 5 -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64-NEXT: vrgatherei16.vv v12, v8, v16 -; RV64-NEXT: vmv.v.v v8, v12 -; RV64-NEXT: ret -; -; RV32-ZVBB-LABEL: reverse_v6i64: -; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: lui a0, %hi(.LCPI45_0) -; RV32-ZVBB-NEXT: addi a0, a0, %lo(.LCPI45_0) -; RV32-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-ZVBB-NEXT: vle16.v v16, (a0) -; RV32-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16 -; RV32-ZVBB-NEXT: vmv.v.v v8, v12 -; RV32-ZVBB-NEXT: ret -; -; RV64-ZVBB-LABEL: reverse_v6i64: -; RV64-ZVBB: # %bb.0: -; RV64-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-ZVBB-NEXT: vid.v v12 -; RV64-ZVBB-NEXT: vrsub.vi v16, v12, 5 -; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16 -; RV64-ZVBB-NEXT: vmv.v.v v8, v12 -; RV64-ZVBB-NEXT: ret +; CHECK-LABEL: reverse_v6i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vrsub.vi v16, v12, 5 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret %res = shufflevector <6 x i64> %a, <6 x i64> poison, <6 x i32> ret <6 x i64> %res } define <12 x i64> @reverse_v12i64(<12 x i64> %a) { -; RV32-LABEL: reverse_v12i64: -; RV32: # %bb.0: -; RV32-NEXT: li a0, 32 -; RV32-NEXT: lui a1, %hi(.LCPI46_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI46_0) -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vrgatherei16.vv v16, v8, v24 -; RV32-NEXT: vmv.v.v v8, v16 -; RV32-NEXT: ret -; -; RV64-LABEL: reverse_v12i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vid.v v16 -; RV64-NEXT: vrsub.vi v24, v16, 11 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-NEXT: vmv.v.v v8, v16 -; RV64-NEXT: ret -; -; RV32-ZVBB-LABEL: reverse_v12i64: -; RV32-ZVBB: # %bb.0: -; RV32-ZVBB-NEXT: li a0, 32 -; RV32-ZVBB-NEXT: lui a1, %hi(.LCPI46_0) -; RV32-ZVBB-NEXT: addi a1, a1, %lo(.LCPI46_0) -; RV32-ZVBB-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-ZVBB-NEXT: vle16.v v24, (a1) -; RV32-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24 -; RV32-ZVBB-NEXT: vmv.v.v v8, v16 -; RV32-ZVBB-NEXT: ret -; -; RV64-ZVBB-LABEL: reverse_v12i64: -; RV64-ZVBB: # %bb.0: -; RV64-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-ZVBB-NEXT: vid.v v16 -; RV64-ZVBB-NEXT: vrsub.vi v24, v16, 11 -; RV64-ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-ZVBB-NEXT: vmv.v.v v8, v16 -; 
RV64-ZVBB-NEXT: ret +; CHECK-LABEL: reverse_v12i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vrsub.vi v24, v16, 11 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vrgatherei16.vv v16, v8, v24 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret %res = shufflevector <12 x i64> %a, <12 x i64> poison, <12 x i32> ret <12 x i64> %res } @@ -1512,3 +1420,8 @@ define <16 x i32> @reverse_v16i32_exact_vlen_256(<16 x i32> %a) vscale_range(4, %res = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> ret <16 x i32> %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV32-ZVBB: {{.*}} +; RV64: {{.*}} +; RV64-ZVBB: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 02355d331e13f..464b4eca35aba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -556,12 +556,14 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: lui a0, 8240 +; ZVKB-ZVE32X-NEXT: addi a0, a0, 1 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12 ; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> @@ -765,12 +767,14 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: lui a0, 8240 +; ZVKB-ZVE32X-NEXT: addi a0, a0, 1 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12 ; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32>