diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4544a922def1a..94a471b7b6a9f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5331,17 +5331,32 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     // Extract the halves of the vectors.
     MVT HalfVT = VT.getHalfNumVectorElementsVT();
 
+    // Recognize if one half is actually undef; the matching above will
+    // otherwise reuse the even stream for the undef one. This improves
+    // spread(2) shuffles.
+    bool LaneIsUndef[2] = { true, true};
+    for (unsigned i = 0; i < Mask.size(); i++)
+      LaneIsUndef[i % 2] &= (Mask[i] == -1);
+
     int Size = Mask.size();
     SDValue EvenV, OddV;
-    assert(EvenSrc >= 0 && "Undef source?");
-    EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
-    EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
-                        DAG.getVectorIdxConstant(EvenSrc % Size, DL));
-
-    assert(OddSrc >= 0 && "Undef source?");
-    OddV = (OddSrc / Size) == 0 ? V1 : V2;
-    OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
-                       DAG.getVectorIdxConstant(OddSrc % Size, DL));
+    if (LaneIsUndef[0]) {
+      EvenV = DAG.getUNDEF(HalfVT);
+    } else {
+      assert(EvenSrc >= 0 && "Undef source?");
+      EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
+      EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
+                          DAG.getVectorIdxConstant(EvenSrc % Size, DL));
+    }
+
+    if (LaneIsUndef[1]) {
+      OddV = DAG.getUNDEF(HalfVT);
+    } else {
+      assert(OddSrc >= 0 && "Undef source?");
+      OddV = (OddSrc / Size) == 0 ? V1 : V2;
+      OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
+                         DAG.getVectorIdxConstant(OddSrc % Size, DL));
+    }
 
     return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
   }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index e4b8e9debad27..97e458e70565c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -242,33 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v24, v16
-; V128-NEXT:    vmv8r.v v16, v8
-; V128-NEXT:    vmv8r.v v8, v24
 ; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v24, 16
-; V128-NEXT:    li a0, -1
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v8, v0
-; V128-NEXT:    vwmaccu.vx v24, a0, v0
-; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    vslidedown.vi v24, v16, 16
+; V128-NEXT:    li a0, 32
+; V128-NEXT:    vslidedown.vi v0, v8, 16
 ; V128-NEXT:    lui a1, 699051
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v8, v0, v16
+; V128-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT:    vzext.vf2 v8, v24
+; V128-NEXT:    vzext.vf2 v24, v0
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
-; V128-NEXT:    addi a1, sp, 16
-; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT:    vsll.vx v8, v8, a0
+; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v0, v16, v8
-; V128-NEXT:    vwmaccu.vx v0, a0, v8
+; V128-NEXT:    vwaddu.vv v0, v8, v16
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
 ; V128-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 66af5718fb9dc..a8eb1f97fd1a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -186,35 +186,29 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-LABEL: interleave_v4i32_offset_1:
 ; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT:    vwaddu.vv v10, v8, v8
-; V128-NEXT:    li a0, -1
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; V128-NEXT:    vid.v v11
-; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V128-NEXT:    vwmaccu.vx v10, a0, v8
-; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT:    vsrl.vi v8, v11, 1
+; V128-NEXT:    vid.v v10
 ; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vadd.vi v8, v8, 1
-; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
+; V128-NEXT:    vsrl.vi v10, v10, 1
+; V128-NEXT:    vadd.vi v11, v10, 1
+; V128-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; V128-NEXT:    vzext.vf2 v10, v8
+; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; V128-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
 ;
 ; V512-LABEL: interleave_v4i32_offset_1:
 ; V512:       # %bb.0:
-; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT:    vwaddu.vv v10, v8, v8
-; V512-NEXT:    li a0, -1
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
-; V512-NEXT:    vid.v v11
-; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; V512-NEXT:    vwmaccu.vx v10, a0, v8
-; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT:    vsrl.vi v8, v11, 1
+; V512-NEXT:    vid.v v10
 ; V512-NEXT:    vmv.v.i v0, 10
-; V512-NEXT:    vadd.vi v8, v8, 1
-; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
+; V512-NEXT:    vsrl.vi v10, v10, 1
+; V512-NEXT:    vadd.vi v11, v10, 1
+; V512-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; V512-NEXT:    vzext.vf2 v10, v8
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
+; V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
 ; V512-NEXT:    vmv1r.v v8, v10
 ; V512-NEXT:    ret
   %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
@@ -411,33 +405,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v24, v16
-; V128-NEXT:    vmv8r.v v16, v8
-; V128-NEXT:    vmv8r.v v8, v24
 ; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v24, 16
-; V128-NEXT:    li a0, -1
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v8, v0
-; V128-NEXT:    vwmaccu.vx v24, a0, v0
+; V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    vslidedown.vi v24, v16, 16
+; V128-NEXT:    li a0, 32
+; V128-NEXT:    vslidedown.vi v0, v8, 16
 ; V128-NEXT:    lui a1, 699051
-; V128-NEXT:    li a2, 32
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v8, v0, v16
+; V128-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; V128-NEXT:    vzext.vf2 v8, v24
+; V128-NEXT:    vzext.vf2 v24, v0
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
-; V128-NEXT:    addi a1, sp, 16
-; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; V128-NEXT:    vsll.vx v8, v8, a0
+; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v0, v16, v8
-; V128-NEXT:    vwmaccu.vx v0, a0, v8
+; V128-NEXT:    vwaddu.vv v0, v8, v16
+; V128-NEXT:    li a0, -1
+; V128-NEXT:    vwmaccu.vx v0, a0, v16
 ; V128-NEXT:    vmv8r.v v8, v0
 ; V128-NEXT:    vmv8r.v v16, v24
 ; V128-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 5d307211ead6e..a9ae2181333e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -801,11 +801,9 @@ define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) {
 define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
 ; CHECK-LABEL: shuffle_spread2_singlesrc_e32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v8
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
   ret <8 x i32> %out
@@ -814,11 +812,10 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
 define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) {
 ; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vwaddu.vv v10, v8, v8
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vsll.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
   ret <8 x i32> %out
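For reference (not part of the patch): a minimal standalone C++ sketch of the undef-lane scan that the new lowerVECTOR_SHUFFLE code performs, using std::vector<int> in place of LLVM's mask ArrayRef, with -1 standing for an undef lane. The sample mask is the spread(2) pattern exercised by shuffle_spread2_singlesrc_e32 above.

// Standalone illustration of the LaneIsUndef scan; plain C++, not LLVM types.
#include <cstdio>
#include <vector>

int main() {
  // spread(2) mask for <8 x i32>: element i lands at position 2*i,
  // every odd result lane is undef (-1).
  std::vector<int> Mask = {0, -1, 1, -1, 2, -1, 3, -1};

  // A lane stream (even/odd) counts as undef only if every mask entry
  // belonging to it is -1, mirroring the loop added in the patch.
  bool LaneIsUndef[2] = {true, true};
  for (unsigned i = 0; i < Mask.size(); i++)
    LaneIsUndef[i % 2] &= (Mask[i] == -1);

  std::printf("even lane undef: %d, odd lane undef: %d\n",
              LaneIsUndef[0], LaneIsUndef[1]);
  return 0;
}

With the odd stream recognized as undef, getWideningInterleave receives an undef OddV, and the updated CHECK lines show the spread(2) shuffle lowering to a single vzext.vf2 (plus a vsll.vx for the index-1 variant) instead of the previous vwaddu.vv/vwmaccu.vx sequence.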