diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6c58989b1afb4..d3fc182319bba 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1530,7 +1530,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                        ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
                        ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
                        ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
-                       ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP});
+                       ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP,
+                       ISD::VECTOR_SHUFFLE});
   if (Subtarget.hasVendorXTHeadMemPair())
     setTargetDAGCombine({ISD::LOAD, ISD::STORE});
   if (Subtarget.useRVVForFixedLengthVectors())
@@ -16926,6 +16927,37 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getBitcast(VT.getSimpleVT(), StridedLoad);
 }
 
+/// Custom legalize <N x i128> or <N x i256> to <M x i64>. This runs
+/// during the combine phase before type legalization, and relies on
+/// DAGCombine not undoing the transform if isShuffleMaskLegal returns false
+/// for the source mask.
+static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget,
+                                            const RISCVTargetLowering &TLI) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  const unsigned ElementSize = VT.getScalarSizeInBits();
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+
+  if (TLI.isTypeLegal(VT) || ElementSize <= Subtarget.getELen() ||
+      !isPowerOf2_64(ElementSize) || VT.getVectorNumElements() % 2 != 0 ||
+      VT.isFloatingPoint() || TLI.isShuffleMaskLegal(Mask, VT))
+    return SDValue();
+
+  SmallVector<int> NewMask;
+  narrowShuffleMaskElts(2, Mask, NewMask);
+
+  LLVMContext &C = *DAG.getContext();
+  EVT NewEltVT = EVT::getIntegerVT(C, ElementSize / 2);
+  EVT NewVT = EVT::getVectorVT(C, NewEltVT, VT.getVectorNumElements() * 2);
+  SDValue Res = DAG.getVectorShuffle(NewVT, DL, DAG.getBitcast(NewVT, V1),
+                                     DAG.getBitcast(NewVT, V2), NewMask);
+  return DAG.getBitcast(VT, Res);
+}
+
+
 static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
 
@@ -18155,6 +18187,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
       return V;
     break;
+  case ISD::VECTOR_SHUFFLE:
+    if (SDValue V = performVECTOR_SHUFFLECombine(N, DAG, Subtarget, *this))
+      return V;
+    break;
   case ISD::INSERT_VECTOR_ELT:
     if (SDValue V = performINSERT_VECTOR_ELTCombine(N, DAG, Subtarget, *this))
       return V;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 8915603471ec7..f397fdde2cba7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1145,132 +1145,200 @@ define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
 define <4 x i128> @shuffle_i128(<4 x i128> %a) {
 ; RV32-LABEL: shuffle_i128:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a2, 0(a1)
-; RV32-NEXT:    lw a3, 4(a1)
-; RV32-NEXT:    lw a4, 8(a1)
-; RV32-NEXT:    lw a5, 12(a1)
-; RV32-NEXT:    lw a6, 48(a1)
-; RV32-NEXT:    lw a7, 52(a1)
-; RV32-NEXT:    lw t0, 56(a1)
-; RV32-NEXT:    lw t1, 60(a1)
-; RV32-NEXT:    lw t2, 32(a1)
-; RV32-NEXT:    lw t3, 36(a1)
-; RV32-NEXT:    lw t4, 40(a1)
-; RV32-NEXT:    lw a1, 44(a1)
-; RV32-NEXT:    sw t2, 48(a0)
-; RV32-NEXT:    sw t3, 52(a0)
-; RV32-NEXT:    sw t4, 56(a0)
-; RV32-NEXT:    sw a1, 60(a0)
-; RV32-NEXT:    sw a6, 32(a0)
-; RV32-NEXT:    sw a7, 36(a0)
-; RV32-NEXT:    sw t0, 40(a0)
-; RV32-NEXT:    sw t1, 44(a0)
-; RV32-NEXT:    sw a2, 16(a0)
-; RV32-NEXT:    sw a3, 20(a0)
-; RV32-NEXT:    sw a4, 24(a0)
-; RV32-NEXT:    sw a5, 28(a0)
-; RV32-NEXT:    sw a2, 0(a0)
-; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw a4, 8(a0)
-; RV32-NEXT:    sw a5, 12(a0)
+; RV32-NEXT:    addi sp, sp, -128
+; RV32-NEXT:    .cfi_def_cfa_offset 128
+; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 128
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    lw a2, 60(a1)
+; RV32-NEXT:    sw a2, 60(sp)
+; RV32-NEXT:    lw a2, 56(a1)
+; RV32-NEXT:    sw a2, 56(sp)
+; RV32-NEXT:    lw a2, 52(a1)
+; RV32-NEXT:    sw a2, 52(sp)
+; RV32-NEXT:    lw a2, 48(a1)
+; RV32-NEXT:    sw a2, 48(sp)
+; RV32-NEXT:    lw a2, 44(a1)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    lw a2, 40(a1)
+; RV32-NEXT:    sw a2, 40(sp)
+; RV32-NEXT:    lw a2, 36(a1)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lw a2, 32(a1)
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    lw a2, 12(a1)
+; RV32-NEXT:    sw a2, 12(sp)
+; RV32-NEXT:    lw a2, 8(a1)
+; RV32-NEXT:    sw a2, 8(sp)
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    sw a2, 4(sp)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    mv a2, sp
+; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    lui a1, %hi(.LCPI78_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI78_0)
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vle32.v v8, (a2)
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vle16.v v12, (a1)
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v12
+; RV32-NEXT:    vse64.v v16, (a0)
+; RV32-NEXT:    addi sp, s0, -128
+; RV32-NEXT:    .cfi_def_cfa sp, 128
+; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    addi sp, sp, 128
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_i128:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 128
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    ld a2, 56(a1)
+; RV64-NEXT:    sd a2, 56(sp)
 ; RV64-NEXT:    ld a2, 48(a1)
-; RV64-NEXT:    ld a3, 56(a1)
-; RV64-NEXT:    ld a4, 0(a1)
-; RV64-NEXT:    ld a5, 8(a1)
-; RV64-NEXT:    ld a6, 32(a1)
-; RV64-NEXT:    ld a1, 40(a1)
-; RV64-NEXT:    sd a2, 32(a0)
-; RV64-NEXT:    sd a3, 40(a0)
-; RV64-NEXT:    sd a6, 48(a0)
-; RV64-NEXT:    sd a1, 56(a0)
-; RV64-NEXT:    sd a4, 0(a0)
-; RV64-NEXT:    sd a5, 8(a0)
-; RV64-NEXT:    sd a4, 16(a0)
-; RV64-NEXT:    sd a5, 24(a0)
+; RV64-NEXT:    sd a2, 48(sp)
+; RV64-NEXT:    ld a2, 40(a1)
+; RV64-NEXT:    sd a2, 40(sp)
+; RV64-NEXT:    ld a2, 32(a1)
+; RV64-NEXT:    sd a2, 32(sp)
+; RV64-NEXT:    ld a2, 8(a1)
+; RV64-NEXT:    sd a2, 8(sp)
+; RV64-NEXT:    ld a1, 0(a1)
+; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    sd a1, 0(sp)
+; RV64-NEXT:    lui a1, %hi(.LCPI78_0)
+; RV64-NEXT:    addi a1, a1, %lo(.LCPI78_0)
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vle64.v v8, (a2)
+; RV64-NEXT:    vle16.v v12, (a1)
+; RV64-NEXT:    vrgatherei16.vv v16, v8, v12
+; RV64-NEXT:    vse64.v v16, (a0)
+; RV64-NEXT:    addi sp, s0, -128
+; RV64-NEXT:    .cfi_def_cfa sp, 128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
   ret <4 x i128> %res
 }
 
 define void @shuffle_i128_ldst(ptr %p) {
-; RV32-LABEL: shuffle_i128_ldst:
+; CHECK-LABEL: shuffle_i128_ldst:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a1, %hi(.LCPI79_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI79_0)
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vrgatherei16.vv v16, v8, v12
+; CHECK-NEXT:    vse64.v v16, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i128>, ptr %p
+  %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+  store <4 x i128> %res, ptr %p
+  ret void
+}
+
+define void @shuffle_i256_ldst(ptr %p) {
+; CHECK-LABEL: shuffle_i256_ldst:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(.LCPI80_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI80_0)
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v10
+; CHECK-NEXT:    vse64.v v24, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x i256>, ptr %p
+  %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+  store <4 x i256> %res, ptr %p
+  ret void
+}
+
+define void @shuffle_i64_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i64_splat:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 48(a0)
-; RV32-NEXT:    lw a2, 52(a0)
-; RV32-NEXT:    lw a3, 56(a0)
-; RV32-NEXT:    lw a4, 60(a0)
-; RV32-NEXT:    lw a5, 0(a0)
-; RV32-NEXT:    lw a6, 4(a0)
-; RV32-NEXT:    lw a7, 8(a0)
-; RV32-NEXT:    lw t0, 12(a0)
-; RV32-NEXT:    lw t1, 32(a0)
-; RV32-NEXT:    lw t2, 36(a0)
-; RV32-NEXT:    lw t3, 40(a0)
-; RV32-NEXT:    lw t4, 44(a0)
-; RV32-NEXT:    sw t1, 48(a0)
-; RV32-NEXT:    sw t2, 52(a0)
-; RV32-NEXT:    sw t3, 56(a0)
-; RV32-NEXT:    sw t4, 60(a0)
-; RV32-NEXT:    sw a5, 16(a0)
-; RV32-NEXT:    sw a6, 20(a0)
-; RV32-NEXT:    sw a7, 24(a0)
-; RV32-NEXT:    sw t0, 28(a0)
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: shuffle_i64_splat:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a1, 0(a0)
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    vse64.v v8, (a0)
+; RV64-NEXT:    ret
+  %a = load <4 x i64>, ptr %p
+  %res = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  store <4 x i64> %res, ptr %p
+  ret void
+}
+
+define void @shuffle_i128_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i128_splat:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a0)
+; RV32-NEXT:    lw a2, 4(a0)
+; RV32-NEXT:    lw a3, 8(a0)
+; RV32-NEXT:    lw a4, 12(a0)
+; RV32-NEXT:    sw a1, 48(a0)
+; RV32-NEXT:    sw a2, 52(a0)
+; RV32-NEXT:    sw a3, 56(a0)
+; RV32-NEXT:    sw a4, 60(a0)
+; RV32-NEXT:    sw a1, 16(a0)
+; RV32-NEXT:    sw a2, 20(a0)
+; RV32-NEXT:    sw a3, 24(a0)
+; RV32-NEXT:    sw a4, 28(a0)
 ; RV32-NEXT:    sw a1, 32(a0)
 ; RV32-NEXT:    sw a2, 36(a0)
 ; RV32-NEXT:    sw a3, 40(a0)
 ; RV32-NEXT:    sw a4, 44(a0)
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: shuffle_i128_ldst:
+; RV64-LABEL: shuffle_i128_splat:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    ld a1, 0(a0)
 ; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 32(a0)
-; RV64-NEXT:    ld a4, 40(a0)
-; RV64-NEXT:    ld a5, 48(a0)
-; RV64-NEXT:    ld a6, 56(a0)
-; RV64-NEXT:    sd a3, 48(a0)
-; RV64-NEXT:    sd a4, 56(a0)
+; RV64-NEXT:    sd a1, 48(a0)
+; RV64-NEXT:    sd a2, 56(a0)
 ; RV64-NEXT:    sd a1, 16(a0)
 ; RV64-NEXT:    sd a2, 24(a0)
-; RV64-NEXT:    sd a5, 32(a0)
-; RV64-NEXT:    sd a6, 40(a0)
+; RV64-NEXT:    sd a1, 32(a0)
+; RV64-NEXT:    sd a2, 40(a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i128>, ptr %p
-  %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+  %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   store <4 x i128> %res, ptr %p
   ret void
 }
 
-define void @shuffle_i256_ldst(ptr %p) {
-; RV32-LABEL: shuffle_i256_ldst:
+define void @shuffle_i256_splat(ptr %p) nounwind {
+; RV32-LABEL: shuffle_i256_splat:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
-; RV32-NEXT:    .cfi_offset s3, -16
-; RV32-NEXT:    .cfi_offset s4, -20
-; RV32-NEXT:    .cfi_offset s5, -24
-; RV32-NEXT:    .cfi_offset s6, -28
-; RV32-NEXT:    .cfi_offset s7, -32
-; RV32-NEXT:    .cfi_offset s8, -36
-; RV32-NEXT:    .cfi_offset s9, -40
 ; RV32-NEXT:    lw a1, 0(a0)
 ; RV32-NEXT:    lw a2, 4(a0)
 ; RV32-NEXT:    lw a3, 8(a0)
@@ -1279,38 +1347,22 @@ define void @shuffle_i256_ldst(ptr %p) {
 ; RV32-NEXT:    lw a6, 20(a0)
 ; RV32-NEXT:    lw a7, 24(a0)
 ; RV32-NEXT:    lw t0, 28(a0)
-; RV32-NEXT:    lw t1, 96(a0)
-; RV32-NEXT:    lw t2, 100(a0)
-; RV32-NEXT:    lw t3, 104(a0)
-; RV32-NEXT:    lw t4, 108(a0)
-; RV32-NEXT:    lw t5, 112(a0)
-; RV32-NEXT:    lw t6, 116(a0)
-; RV32-NEXT:    lw s0, 120(a0)
-; RV32-NEXT:    lw s1, 124(a0)
-; RV32-NEXT:    lw s2, 64(a0)
-; RV32-NEXT:    lw s3, 68(a0)
-; RV32-NEXT:    lw s4, 72(a0)
-; RV32-NEXT:    lw s5, 76(a0)
-; RV32-NEXT:    lw s6, 80(a0)
-; RV32-NEXT:    lw s7, 84(a0)
-; RV32-NEXT:    lw s8, 88(a0)
-; RV32-NEXT:    lw s9, 92(a0)
-; RV32-NEXT:    sw s6, 112(a0)
-; RV32-NEXT:    sw s7, 116(a0)
-; RV32-NEXT:    sw s8, 120(a0)
-; RV32-NEXT:    sw s9, 124(a0)
-; RV32-NEXT:    sw s2, 96(a0)
-; RV32-NEXT:    sw s3, 100(a0)
-; RV32-NEXT:    sw s4, 104(a0)
-; RV32-NEXT:    sw s5, 108(a0)
-; RV32-NEXT:    sw t5, 80(a0)
-; RV32-NEXT:    sw t6, 84(a0)
-; RV32-NEXT:    sw s0, 88(a0)
-; RV32-NEXT:    sw s1, 92(a0)
-; RV32-NEXT:    sw t1, 64(a0)
-; RV32-NEXT:    sw t2, 68(a0)
-; RV32-NEXT:    sw t3, 72(a0)
-; RV32-NEXT:    sw t4, 76(a0)
+; RV32-NEXT:    sw a5, 112(a0)
+; RV32-NEXT:    sw a6, 116(a0)
+; RV32-NEXT:    sw a7, 120(a0)
+; RV32-NEXT:    sw t0, 124(a0)
+; RV32-NEXT:    sw a1, 96(a0)
+; RV32-NEXT:    sw a2, 100(a0)
+; RV32-NEXT:    sw a3, 104(a0)
+; RV32-NEXT:    sw a4, 108(a0)
+; RV32-NEXT:    sw a5, 80(a0)
+; RV32-NEXT:    sw a6, 84(a0)
+; RV32-NEXT:    sw a7, 88(a0)
+; RV32-NEXT:    sw t0, 92(a0)
+; RV32-NEXT:    sw a1, 64(a0)
+; RV32-NEXT:    sw a2, 68(a0)
+; RV32-NEXT:    sw a3, 72(a0)
+; RV32-NEXT:    sw a4, 76(a0)
 ; RV32-NEXT:    sw a5, 48(a0)
 ; RV32-NEXT:    sw a6, 52(a0)
 ; RV32-NEXT:    sw a7, 56(a0)
@@ -1319,59 +1371,30 @@ define void @shuffle_i256_ldst(ptr %p) {
 ; RV32-NEXT:    sw a2, 36(a0)
 ; RV32-NEXT:    sw a3, 40(a0)
 ; RV32-NEXT:    sw a4, 44(a0)
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
-; RV32-NEXT:    .cfi_restore s3
-; RV32-NEXT:    .cfi_restore s4
-; RV32-NEXT:    .cfi_restore s5
-; RV32-NEXT:    .cfi_restore s6
-; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: shuffle_i256_ldst:
+; RV64-LABEL: shuffle_i256_splat:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 96(a0)
-; RV64-NEXT:    ld a2, 104(a0)
-; RV64-NEXT:    ld a3, 112(a0)
-; RV64-NEXT:    ld a4, 120(a0)
-; RV64-NEXT:    ld a5, 0(a0)
-; RV64-NEXT:    ld a6, 8(a0)
-; RV64-NEXT:    ld a7, 16(a0)
-; RV64-NEXT:    ld t0, 24(a0)
-; RV64-NEXT:    ld t1, 64(a0)
-; RV64-NEXT:    ld t2, 72(a0)
-; RV64-NEXT:    ld t3, 80(a0)
-; RV64-NEXT:    ld t4, 88(a0)
-; RV64-NEXT:    sd t1, 96(a0)
-; RV64-NEXT:    sd t2, 104(a0)
-; RV64-NEXT:    sd t3, 112(a0)
-; RV64-NEXT:    sd t4, 120(a0)
-; RV64-NEXT:    sd a5, 32(a0)
-; RV64-NEXT:    sd a6, 40(a0)
-; RV64-NEXT:    sd a7, 48(a0)
-; RV64-NEXT:    sd t0, 56(a0)
+; RV64-NEXT:    ld a1, 0(a0)
+; RV64-NEXT:    ld a2, 8(a0)
+; RV64-NEXT:    ld a3, 16(a0)
+; RV64-NEXT:    ld a4, 24(a0)
+; RV64-NEXT:    sd a1, 96(a0)
+; RV64-NEXT:    sd a2, 104(a0)
+; RV64-NEXT:    sd a3, 112(a0)
+; RV64-NEXT:    sd a4, 120(a0)
+; RV64-NEXT:    sd a1, 32(a0)
+; RV64-NEXT:    sd a2, 40(a0)
+; RV64-NEXT:    sd a3, 48(a0)
+; RV64-NEXT:    sd a4, 56(a0)
 ; RV64-NEXT:    sd a1, 64(a0)
 ; RV64-NEXT:    sd a2, 72(a0)
 ; RV64-NEXT:    sd a3, 80(a0)
 ; RV64-NEXT:    sd a4, 88(a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i256>, ptr %p
-  %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+  %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   store <4 x i256> %res, ptr %p
   ret void
 }
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4603c0d24f5d7..a42a4b0d3531c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -402,49 +402,16 @@ entry:
 }
 
 define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) {
-; RV32-LABEL: shuffle_i128_ldst:
-; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 48(a0)
-; RV32-NEXT:    lw a2, 52(a0)
-; RV32-NEXT:    lw a3, 56(a0)
-; RV32-NEXT:    lw a4, 60(a0)
-; RV32-NEXT:    lw a5, 0(a0)
-; RV32-NEXT:    lw a6, 4(a0)
-; RV32-NEXT:    lw a7, 8(a0)
-; RV32-NEXT:    lw t0, 12(a0)
-; RV32-NEXT:    lw t1, 32(a0)
-; RV32-NEXT:    lw t2, 36(a0)
-; RV32-NEXT:    lw t3, 40(a0)
-; RV32-NEXT:    lw t4, 44(a0)
-; RV32-NEXT:    sw t1, 48(a0)
-; RV32-NEXT:    sw t2, 52(a0)
-; RV32-NEXT:    sw t3, 56(a0)
-; RV32-NEXT:    sw t4, 60(a0)
-; RV32-NEXT:    sw a5, 16(a0)
-; RV32-NEXT:    sw a6, 20(a0)
-; RV32-NEXT:    sw a7, 24(a0)
-; RV32-NEXT:    sw t0, 28(a0)
-; RV32-NEXT:    sw a1, 32(a0)
-; RV32-NEXT:    sw a2, 36(a0)
-; RV32-NEXT:    sw a3, 40(a0)
-; RV32-NEXT:    sw a4, 44(a0)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: shuffle_i128_ldst:
-; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 0(a0)
-; RV64-NEXT:    ld a2, 8(a0)
-; RV64-NEXT:    ld a3, 32(a0)
-; RV64-NEXT:    ld a4, 40(a0)
-; RV64-NEXT:    ld a5, 48(a0)
-; RV64-NEXT:    ld a6, 56(a0)
-; RV64-NEXT:    sd a3, 48(a0)
-; RV64-NEXT:    sd a4, 56(a0)
-; RV64-NEXT:    sd a1, 16(a0)
-; RV64-NEXT:    sd a2, 24(a0)
-; RV64-NEXT:    sd a5, 32(a0)
-; RV64-NEXT:    sd a6, 40(a0)
-; RV64-NEXT:    ret
+; CHECK-LABEL: shuffle_i128_ldst:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl4re64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vmv4r.v v12, v8
+; CHECK-NEXT:    vmv1r.v v14, v11
+; CHECK-NEXT:    vmv1r.v v15, v10
+; CHECK-NEXT:    vs4r.v v12, (a0)
+; CHECK-NEXT:    ret
   %a = load <4 x i128>, ptr %p
   %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
   store <4 x i128> %res, ptr %p
@@ -452,129 +419,19 @@ define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) {
 }
 
 define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) {
-; RV32-LABEL: shuffle_i256_ldst:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
-; RV32-NEXT:    .cfi_offset s3, -16
-; RV32-NEXT:    .cfi_offset s4, -20
-; RV32-NEXT:    .cfi_offset s5, -24
-; RV32-NEXT:    .cfi_offset s6, -28
-; RV32-NEXT:    .cfi_offset s7, -32
-; RV32-NEXT:    .cfi_offset s8, -36
-; RV32-NEXT:    .cfi_offset s9, -40
-; RV32-NEXT:    lw a1, 0(a0)
-; RV32-NEXT:    lw a2, 4(a0)
-; RV32-NEXT:    lw a3, 8(a0)
-; RV32-NEXT:    lw a4, 12(a0)
-; RV32-NEXT:    lw a5, 16(a0)
-; RV32-NEXT:    lw a6, 20(a0)
-; RV32-NEXT:    lw a7, 24(a0)
-; RV32-NEXT:    lw t0, 28(a0)
-; RV32-NEXT:    lw t1, 96(a0)
-; RV32-NEXT:    lw t2, 100(a0)
-; RV32-NEXT:    lw t3, 104(a0)
-; RV32-NEXT:    lw t4, 108(a0)
-; RV32-NEXT:    lw t5, 112(a0)
-; RV32-NEXT:    lw t6, 116(a0)
-; RV32-NEXT:    lw s0, 120(a0)
-; RV32-NEXT:    lw s1, 124(a0)
-; RV32-NEXT:    lw s2, 64(a0)
-; RV32-NEXT:    lw s3, 68(a0)
-; RV32-NEXT:    lw s4, 72(a0)
-; RV32-NEXT:    lw s5, 76(a0)
-; RV32-NEXT:    lw s6, 80(a0)
-; RV32-NEXT:    lw s7, 84(a0)
-; RV32-NEXT:    lw s8, 88(a0)
-; RV32-NEXT:    lw s9, 92(a0)
-; RV32-NEXT:    sw s6, 112(a0)
-; RV32-NEXT:    sw s7, 116(a0)
-; RV32-NEXT:    sw s8, 120(a0)
-; RV32-NEXT:    sw s9, 124(a0)
-; RV32-NEXT:    sw s2, 96(a0)
-; RV32-NEXT:    sw s3, 100(a0)
-; RV32-NEXT:    sw s4, 104(a0)
-; RV32-NEXT:    sw s5, 108(a0)
-; RV32-NEXT:    sw t5, 80(a0)
-; RV32-NEXT:    sw t6, 84(a0)
-; RV32-NEXT:    sw s0, 88(a0)
-; RV32-NEXT:    sw s1, 92(a0)
-; RV32-NEXT:    sw t1, 64(a0)
-; RV32-NEXT:    sw t2, 68(a0)
-; RV32-NEXT:    sw t3, 72(a0)
-; RV32-NEXT:    sw t4, 76(a0)
-; RV32-NEXT:    sw a5, 48(a0)
-; RV32-NEXT:    sw a6, 52(a0)
-; RV32-NEXT:    sw a7, 56(a0)
-; RV32-NEXT:    sw t0, 60(a0)
-; RV32-NEXT:    sw a1, 32(a0)
-; RV32-NEXT:    sw a2, 36(a0)
-; RV32-NEXT:    sw a3, 40(a0)
-; RV32-NEXT:    sw a4, 44(a0)
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
-; RV32-NEXT:    .cfi_restore s3
-; RV32-NEXT:    .cfi_restore s4
-; RV32-NEXT:    .cfi_restore s5
-; RV32-NEXT:    .cfi_restore s6
-; RV32-NEXT:    .cfi_restore s7
-; RV32-NEXT:    .cfi_restore s8
-; RV32-NEXT:    .cfi_restore s9
-; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    .cfi_def_cfa_offset 0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: shuffle_i256_ldst:
-; RV64:       # %bb.0:
-; RV64-NEXT:    ld a1, 96(a0)
-; RV64-NEXT:    ld a2, 104(a0)
-; RV64-NEXT:    ld a3, 112(a0)
-; RV64-NEXT:    ld a4, 120(a0)
-; RV64-NEXT:    ld a5, 0(a0)
-; RV64-NEXT:    ld a6, 8(a0)
-; RV64-NEXT:    ld a7, 16(a0)
-; RV64-NEXT:    ld t0, 24(a0)
-; RV64-NEXT:    ld t1, 64(a0)
-; RV64-NEXT:    ld t2, 72(a0)
-; RV64-NEXT:    ld t3, 80(a0)
-; RV64-NEXT:    ld t4, 88(a0)
-; RV64-NEXT:    sd t1, 96(a0)
-; RV64-NEXT:    sd t2, 104(a0)
-; RV64-NEXT:    sd t3, 112(a0)
-; RV64-NEXT:    sd t4, 120(a0)
-; RV64-NEXT:    sd a5, 32(a0)
-; RV64-NEXT:    sd a6, 40(a0)
-; RV64-NEXT:    sd a7, 48(a0)
-; RV64-NEXT:    sd t0, 56(a0)
-; RV64-NEXT:    sd a1, 64(a0)
-; RV64-NEXT:    sd a2, 72(a0)
-; RV64-NEXT:    sd a3, 80(a0)
-; RV64-NEXT:    sd a4, 88(a0)
-; RV64-NEXT:    ret
+; CHECK-LABEL: shuffle_i256_ldst:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl8re64.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vmv1r.v v11, v9
+; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vmv1r.v v20, v14
+; CHECK-NEXT:    vmv1r.v v21, v15
+; CHECK-NEXT:    vmv1r.v v22, v12
+; CHECK-NEXT:    vmv1r.v v23, v13
+; CHECK-NEXT:    vs8r.v v16, (a0)
+; CHECK-NEXT:    ret
   %a = load <4 x i256>, ptr %p
   %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
   store <4 x i256> %res, ptr %p
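
For reference, the new combine rewrites a shuffle whose elements are wider than
ELEN as a shuffle of half-width integer elements: both operands are bitcast to
the narrower element type, and the mask is expanded to twice as many entries
with narrowShuffleMaskElts. Below is a minimal IR-level sketch of the
equivalent rewrite for the <4 x i128> tests above, assuming ELEN=64 and
little-endian element layout; the combine itself operates on SelectionDAG
nodes and reapplies until the element type (or the resulting mask) is legal.
The function name is illustrative, not part of the patch:

; Original shuffle: i128 elements (illegal for ELEN=64), mask <0,0,3,2>.
define <4 x i128> @shuffle_i128_sketch(<4 x i128> %a) {
  ; Reinterpret the source as twice as many i64 elements.
  %v = bitcast <4 x i128> %a to <8 x i64>
  ; Each original index i expands to the pair (2*i, 2*i+1),
  ; so <0,0,3,2> becomes <0,1,0,1,6,7,4,5>.
  %s = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  ; Reinterpret the result back as i128 elements.
  %res = bitcast <8 x i64> %s to <4 x i128>
  ret <4 x i128> %res
}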