From f50d78772b3f39d8669c7b87fc8d776b682c8452 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 14 Feb 2025 15:05:23 -0800 Subject: [PATCH 1/2] [RISCV] Add fixed vector deinterleave tests with 2 sources. NFC --- .../rvv/fixed-vectors-shuffle-deinterleave.ll | 93 ++++ .../rvv/fixed-vectors-shufflevector-vnsrl.ll | 457 ++++++++++++++++++ 2 files changed, 550 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index 10dadbc022e02..67b89f6e7cc5a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -369,3 +369,96 @@ entry: store <2 x i8> %shuffle.i5, ptr %out, align 1 ret void } + +define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) { +; CHECK-LABEL: deinterleave4_0_i8_two_source: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsll.vi v10, v10, 2 +; CHECK-NEXT: vadd.vi v10, v10, -8 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vse8.v v8, (a2) +; CHECK-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) { +; CHECK-LABEL: deinterleave4_8_i8_two_source: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: li 
a0, -1 +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vwaddu.vv v11, v8, v10 +; CHECK-NEXT: vwmaccu.vx v11, a0, v10 +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 +; CHECK-NEXT: vse8.v v8, (a2) +; CHECK-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @deinterleave8_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) { +; CHECK-LABEL: deinterleave8_0_i8_two_source: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vse8.v v8, (a2) +; CHECK-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) { +; CHECK-LABEL: deinterleave8_8_i8_two_source: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vmv.v.i v0, -3 +; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vse8.v v9, (a2) +; CHECK-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector 
<8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index 4e5ef9c002f1a..d0b8a94c56ffa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -584,3 +584,460 @@ entry: store <64 x i32> %shuffle.i5, ptr %out, align 4 ret void } + +define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_i8_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vle8.v v9, (a1) +; V-NEXT: vmv.v.i v0, -16 +; V-NEXT: vid.v v10 +; V-NEXT: vadd.vv v10, v10, v10 +; V-NEXT: vadd.vi v10, v10, -8 +; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; V-NEXT: vrgather.vv v8, v9, v10, v0.t +; V-NEXT: vse8.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i8_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vle8.v v9, (a1) +; ZVE32F-NEXT: vmv.v.i v0, -16 +; ZVE32F-NEXT: vid.v v10 +; ZVE32F-NEXT: vadd.vv v10, v10, v10 +; ZVE32F-NEXT: vadd.vi v10, v10, -8 +; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t +; ZVE32F-NEXT: vse8.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_8_8_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma 
+; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vle8.v v9, (a1) +; V-NEXT: vmv.v.i v0, -16 +; V-NEXT: vid.v v10 +; V-NEXT: vadd.vv v10, v10, v10 +; V-NEXT: vadd.vi v10, v10, -7 +; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 8 +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; V-NEXT: vrgather.vv v8, v9, v10, v0.t +; V-NEXT: vse8.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_8_8_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vle8.v v9, (a1) +; ZVE32F-NEXT: vmv.v.i v0, -16 +; ZVE32F-NEXT: vid.v v10 +; ZVE32F-NEXT: vadd.vv v10, v10, v10 +; ZVE32F-NEXT: vadd.vi v10, v10, -7 +; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 8 +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, mu +; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t +; ZVE32F-NEXT: vse8.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %in0, align 1 + %1 = load <8 x i8>, ptr %in1, align 1 + %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> + store <8 x i8> %shuffle.i5, ptr %out, align 1 + ret void +} + +define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_i16_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vle16.v v8, (a0) +; V-NEXT: vle16.v v9, (a1) +; V-NEXT: vid.v v10 +; V-NEXT: vadd.vv v10, v10, v10 +; V-NEXT: vadd.vi v10, v10, -4 +; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vrgather.vv v8, v9, v10, v0.t +; V-NEXT: vse16.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i16_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vle16.v v9, (a1) +; ZVE32F-NEXT: vid.v v10 +; ZVE32F-NEXT: vadd.vv v10, v10, v10 +; ZVE32F-NEXT: vadd.vi v10, v10, -4 +; ZVE32F-NEXT: 
vmv.v.i v0, 12 +; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t +; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i16>, ptr %in0, align 2 + %1 = load <4 x i16>, ptr %in1, align 2 + %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> + store <4 x i16> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_16_i16_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vle16.v v8, (a1) +; V-NEXT: vle16.v v9, (a0) +; V-NEXT: li a0, -1 +; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V-NEXT: vslidedown.vi v10, v8, 2 +; V-NEXT: vwaddu.vv v11, v8, v10 +; V-NEXT: vwmaccu.vx v11, a0, v10 +; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vnsrl.wi v8, v9, 16 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vmerge.vvm v8, v8, v11, v0 +; V-NEXT: vse16.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_16_i16_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vle16.v v8, (a1) +; ZVE32F-NEXT: vle16.v v9, (a0) +; ZVE32F-NEXT: li a0, -1 +; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; ZVE32F-NEXT: vwaddu.vv v11, v8, v10 +; ZVE32F-NEXT: vwmaccu.vx v11, a0, v10 +; ZVE32F-NEXT: vmv.v.i v0, 12 +; ZVE32F-NEXT: vnsrl.wi v8, v9, 16 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x i16>, ptr %in0, align 2 + %1 = load <4 x i16>, ptr %in1, align 2 + %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> + store <4 x i16> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_half_two_source: +; 
V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vle16.v v8, (a0) +; V-NEXT: vle16.v v9, (a1) +; V-NEXT: vid.v v10 +; V-NEXT: vadd.vv v10, v10, v10 +; V-NEXT: vadd.vi v10, v10, -4 +; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu +; V-NEXT: vrgather.vv v8, v9, v10, v0.t +; V-NEXT: vse16.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_half_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vle16.v v8, (a0) +; ZVE32F-NEXT: vle16.v v9, (a1) +; ZVE32F-NEXT: vid.v v10 +; ZVE32F-NEXT: vadd.vv v10, v10, v10 +; ZVE32F-NEXT: vadd.vi v10, v10, -4 +; ZVE32F-NEXT: vmv.v.i v0, 12 +; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t +; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x half>, ptr %in0, align 2 + %1 = load <4 x half>, ptr %in1, align 2 + %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> + store <4 x half> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_16_half_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vle16.v v8, (a1) +; V-NEXT: vle16.v v9, (a0) +; V-NEXT: li a0, -1 +; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; V-NEXT: vslidedown.vi v10, v8, 2 +; V-NEXT: vwaddu.vv v11, v8, v10 +; V-NEXT: vwmaccu.vx v11, a0, v10 +; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vnsrl.wi v8, v9, 16 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vmerge.vvm v8, v8, v11, v0 +; V-NEXT: vse16.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_16_half_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vle16.v v8, (a1) +; ZVE32F-NEXT: 
vle16.v v9, (a0) +; ZVE32F-NEXT: li a0, -1 +; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; ZVE32F-NEXT: vwaddu.vv v11, v8, v10 +; ZVE32F-NEXT: vwmaccu.vx v11, a0, v10 +; ZVE32F-NEXT: vmv.v.i v0, 12 +; ZVE32F-NEXT: vnsrl.wi v8, v9, 16 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <4 x half>, ptr %in0, align 2 + %1 = load <4 x half>, ptr %in1, align 2 + %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> + store <4 x half> %shuffle.i5, ptr %out, align 2 + ret void +} + +define void @vnsrl_0_i32_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_i32_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vle32.v v9, (a1) +; V-NEXT: vslideup.vi v8, v9, 1 +; V-NEXT: vse32.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i32_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; ZVE32F-NEXT: vse32.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x i32>, ptr %in0, align 4 + %1 = load <2 x i32>, ptr %in1, align 4 + %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> + store <2 x i32> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_32_i32_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vle32.v v9, (a1) +; V-NEXT: vmv.v.i v0, 1 +; V-NEXT: vrgather.vi v9, v8, 1, v0.t +; V-NEXT: vse32.v v9, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_32_i32_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; 
ZVE32F-NEXT: vmv.v.i v0, 1 +; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t +; ZVE32F-NEXT: vse32.v v9, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x i32>, ptr %in0, align 4 + %1 = load <2 x i32>, ptr %in1, align 4 + %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> + store <2 x i32> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_0_float_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_float_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vle32.v v9, (a1) +; V-NEXT: vslideup.vi v8, v9, 1 +; V-NEXT: vse32.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_float_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; ZVE32F-NEXT: vslideup.vi v8, v9, 1 +; ZVE32F-NEXT: vse32.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x float>, ptr %in0, align 4 + %1 = load <2 x float>, ptr %in1, align 4 + %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> + store <2 x float> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_32_float_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vle32.v v9, (a1) +; V-NEXT: vmv.v.i v0, 1 +; V-NEXT: vrgather.vi v9, v8, 1, v0.t +; V-NEXT: vse32.v v9, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_32_float_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; ZVE32F-NEXT: vmv.v.i v0, 1 +; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t +; ZVE32F-NEXT: vse32.v v9, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x float>, ptr %in0, align 4 + %1 = load <2 x float>, ptr %in1, align 4 + %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x 
i32> + store <2 x float> %shuffle.i5, ptr %out, align 4 + ret void +} + +define void @vnsrl_0_i64_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_i64_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vle64.v v9, (a1) +; V-NEXT: vslideup.vi v8, v9, 1 +; V-NEXT: vse64.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_i64_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; ZVE32F-NEXT: vslideup.vi v8, v9, 2 +; ZVE32F-NEXT: vse32.v v8, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x i64>, ptr %in0, align 8 + %1 = load <2 x i64>, ptr %in1, align 8 + %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> + store <2 x i64> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_64_i64_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_64_i64_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vle64.v v9, (a1) +; V-NEXT: vmv.v.i v0, 1 +; V-NEXT: vrgather.vi v9, v8, 1, v0.t +; V-NEXT: vse64.v v9, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_64_i64_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vle32.v v9, (a1) +; ZVE32F-NEXT: vmv.v.i v0, 3 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2, v0.t +; ZVE32F-NEXT: vse32.v v9, (a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x i64>, ptr %in0, align 8 + %1 = load <2 x i64>, ptr %in1, align 8 + %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> + store <2 x i64> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_0_double_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_0_double_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vle64.v v9, (a1) 
+; V-NEXT: vslideup.vi v8, v9, 1 +; V-NEXT: vse64.v v8, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_0_double_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a0, 0(a0) +; ZVE32F-NEXT: ld a1, 0(a1) +; ZVE32F-NEXT: sd a0, 0(a2) +; ZVE32F-NEXT: sd a1, 8(a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x double>, ptr %in0, align 8 + %1 = load <2 x double>, ptr %in1, align 8 + %shuffle.i5 = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> + store <2 x double> %shuffle.i5, ptr %out, align 8 + ret void +} + +define void @vnsrl_64_double_two_source(ptr %in0, ptr %in1, ptr %out) { +; V-LABEL: vnsrl_64_double_two_source: +; V: # %bb.0: # %entry +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vle64.v v8, (a0) +; V-NEXT: vle64.v v9, (a1) +; V-NEXT: vmv.v.i v0, 1 +; V-NEXT: vrgather.vi v9, v8, 1, v0.t +; V-NEXT: vse64.v v9, (a2) +; V-NEXT: ret +; +; ZVE32F-LABEL: vnsrl_64_double_two_source: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: ld a0, 8(a0) +; ZVE32F-NEXT: ld a1, 8(a1) +; ZVE32F-NEXT: sd a0, 0(a2) +; ZVE32F-NEXT: sd a1, 8(a2) +; ZVE32F-NEXT: ret +entry: + %0 = load <2 x double>, ptr %in0, align 8 + %1 = load <2 x double>, ptr %in1, align 8 + %shuffle.i5 = shufflevector <2 x double> %0, <2 x double> %1, <2 x i32> + store <2 x double> %shuffle.i5, ptr %out, align 8 + ret void +} From 0cfa4ec30987d93b7af1d281ebcfa387fcdbdc08 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 14 Feb 2025 15:15:06 -0800 Subject: [PATCH 2/2] [RISCV] Recognize de-interleave shuffles with 2 sources. We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 ++ .../rvv/fixed-vectors-deinterleave-load.ll | 34 ++-- .../rvv/fixed-vectors-shufflevector-vnsrl.ll | 168 +++++++----------- .../RISCV/rvv/vector-deinterleave-fixed.ll | 33 ++-- 4 files changed, 110 insertions(+), 145 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 43d32987da95a..c40ab0d09bdf6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5593,6 +5593,26 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 1 < count_if(Mask, [](int Idx) { return Idx != -1; })) { if (SDValue Src = getSingleShuffleSrc(VT, V1, V2)) return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG); + // Narrow each source and concatenate them. This is only correct for + // Factor == 2: for larger factors the elements taken from V2 start at + // lane NumElts / Factor, not NumElts / 2, so concatenating two + // half-width results would put them in the wrong lanes. + if (Factor == 2 && + 1 < count_if(Mask, + [&Mask](int Idx) { return Idx < (int)Mask.size(); }) && + 1 < count_if(Mask, [&Mask](int Idx) { + return Idx >= (int)Mask.size(); + })) { + // FIXME: For small LMUL it is better to concatenate first. + // FIXME: Support Factor > 2 by concatenating at NumElts / Factor and + // inserting the result into an undef VT.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT(); + SDValue Lo = + getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG); + SDValue Hi = + getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } } } } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index b4634dbf5a5e8..e53dfc23a84bb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: li a0, -256 +; CHECK-NEXT: vlm.v v0, (a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vadd.vv v11, v9, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vadd.vi v12, v11, -16 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v9, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v8, 2 +; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v9, 0 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vadd.vi v11, v11, -15 -; CHECK-NEXT: vmerge.vim v13, v10, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v9, 8 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 ; CHECK-NEXT: vnsrl.wi v8, v8, 8 -;
CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v10, v13, v12, v0.t -; CHECK-NEXT: vrgather.vv v8, v13, v11, v0.t +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 8 +; CHECK-NEXT: vslideup.vi v9, v8, 8 ; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v9, 0 ; CHECK-NEXT: ret %vec = load <32 x i1>, ptr %p %deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index d0b8a94c56ffa..180579e47d075 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -589,33 +589,27 @@ define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-LABEL: vnsrl_0_i8_two_source: ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; V-NEXT: vle8.v v8, (a0) -; V-NEXT: vle8.v v9, (a1) -; V-NEXT: vmv.v.i v0, -16 -; V-NEXT: vid.v v10 -; V-NEXT: vadd.vv v10, v10, v10 -; V-NEXT: vadd.vi v10, v10, -8 +; V-NEXT: vle8.v v8, (a1) +; V-NEXT: vle8.v v9, (a0) ; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma ; V-NEXT: vnsrl.wi v8, v8, 0 -; V-NEXT: vsetivli zero, 8, e8, mf4, ta, mu -; V-NEXT: vrgather.vv v8, v9, v10, v0.t -; V-NEXT: vse8.v v8, (a2) +; V-NEXT: vnsrl.wi v9, v9, 0 +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vslideup.vi v9, v8, 4 +; V-NEXT: vse8.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_i8_two_source:
; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; ZVE32F-NEXT: vle8.v v8, (a0) -; ZVE32F-NEXT: vle8.v v9, (a1) -; ZVE32F-NEXT: vmv.v.i v0, -16 -; ZVE32F-NEXT: vid.v v10 -; ZVE32F-NEXT: vadd.vv v10, v10, v10 -; ZVE32F-NEXT: vadd.vi v10, v10, -8 +; ZVE32F-NEXT: vle8.v v8, (a1) +; ZVE32F-NEXT: vle8.v v9, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 -; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, mu -; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t -; ZVE32F-NEXT: vse8.v v8, (a2) +; ZVE32F-NEXT: vnsrl.wi v9, v9, 0 +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; ZVE32F-NEXT: vse8.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <8 x i8>, ptr %in0, align 1 @@ -629,33 +623,27 @@ define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-LABEL: vnsrl_8_8_two_source: ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; V-NEXT: vle8.v v8, (a0) -; V-NEXT: vle8.v v9, (a1) -; V-NEXT: vmv.v.i v0, -16 -; V-NEXT: vid.v v10 -; V-NEXT: vadd.vv v10, v10, v10 -; V-NEXT: vadd.vi v10, v10, -7 +; V-NEXT: vle8.v v8, (a1) +; V-NEXT: vle8.v v9, (a0) ; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma ; V-NEXT: vnsrl.wi v8, v8, 8 -; V-NEXT: vsetivli zero, 8, e8, mf4, ta, mu -; V-NEXT: vrgather.vv v8, v9, v10, v0.t -; V-NEXT: vse8.v v8, (a2) +; V-NEXT: vnsrl.wi v9, v9, 8 +; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; V-NEXT: vslideup.vi v9, v8, 4 +; V-NEXT: vse8.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_8_8_two_source: ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; ZVE32F-NEXT: vle8.v v8, (a0) -; ZVE32F-NEXT: vle8.v v9, (a1) -; ZVE32F-NEXT: vmv.v.i v0, -16 -; ZVE32F-NEXT: vid.v v10 -; ZVE32F-NEXT: vadd.vv v10, v10, v10 -; ZVE32F-NEXT: vadd.vi v10, v10, -7 +; ZVE32F-NEXT: vle8.v v8, (a1) +; ZVE32F-NEXT: vle8.v v9, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32F-NEXT: vnsrl.wi v8, v8, 8 -; ZVE32F-NEXT: vsetivli 
zero, 8, e8, mf4, ta, mu -; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t -; ZVE32F-NEXT: vse8.v v8, (a2) +; ZVE32F-NEXT: vnsrl.wi v9, v9, 8 +; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma +; ZVE32F-NEXT: vslideup.vi v9, v8, 4 +; ZVE32F-NEXT: vse8.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <8 x i8>, ptr %in0, align 1 @@ -669,33 +657,27 @@ define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-LABEL: vnsrl_0_i16_two_source: ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vle16.v v8, (a0) -; V-NEXT: vle16.v v9, (a1) -; V-NEXT: vid.v v10 -; V-NEXT: vadd.vv v10, v10, v10 -; V-NEXT: vadd.vi v10, v10, -4 -; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vle16.v v8, (a1) +; V-NEXT: vle16.v v9, (a0) ; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V-NEXT: vnsrl.wi v8, v8, 0 -; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu -; V-NEXT: vrgather.vv v8, v9, v10, v0.t -; V-NEXT: vse16.v v8, (a2) +; V-NEXT: vnsrl.wi v9, v9, 0 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vslideup.vi v9, v8, 2 +; V-NEXT: vse16.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_i16_two_source: ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) -; ZVE32F-NEXT: vle16.v v9, (a1) -; ZVE32F-NEXT: vid.v v10 -; ZVE32F-NEXT: vadd.vv v10, v10, v10 -; ZVE32F-NEXT: vadd.vi v10, v10, -4 -; ZVE32F-NEXT: vmv.v.i v0, 12 +; ZVE32F-NEXT: vle16.v v8, (a1) +; ZVE32F-NEXT: vle16.v v9, (a0) ; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 -; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t -; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: vnsrl.wi v9, v9, 0 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vslideup.vi v9, v8, 2 +; ZVE32F-NEXT: vse16.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i16>, ptr %in0, align 2 @@ -711,16 +693,12 @@ define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-NEXT: 
vsetivli zero, 4, e16, mf4, ta, ma ; V-NEXT: vle16.v v8, (a1) ; V-NEXT: vle16.v v9, (a0) -; V-NEXT: li a0, -1 ; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; V-NEXT: vslidedown.vi v10, v8, 2 -; V-NEXT: vwaddu.vv v11, v8, v10 -; V-NEXT: vwmaccu.vx v11, a0, v10 -; V-NEXT: vmv.v.i v0, 12 -; V-NEXT: vnsrl.wi v8, v9, 16 +; V-NEXT: vnsrl.wi v8, v8, 16 +; V-NEXT: vnsrl.wi v9, v9, 16 ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vmerge.vvm v8, v8, v11, v0 -; V-NEXT: vse16.v v8, (a2) +; V-NEXT: vslideup.vi v9, v8, 2 +; V-NEXT: vse16.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_16_i16_two_source: @@ -728,16 +706,12 @@ define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) { ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVE32F-NEXT: vle16.v v8, (a1) ; ZVE32F-NEXT: vle16.v v9, (a0) -; ZVE32F-NEXT: li a0, -1 ; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; ZVE32F-NEXT: vwaddu.vv v11, v8, v10 -; ZVE32F-NEXT: vwmaccu.vx v11, a0, v10 -; ZVE32F-NEXT: vmv.v.i v0, 12 -; ZVE32F-NEXT: vnsrl.wi v8, v9, 16 +; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 +; ZVE32F-NEXT: vnsrl.wi v9, v9, 16 ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vmerge.vvm v8, v8, v11, v0 -; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: vslideup.vi v9, v8, 2 +; ZVE32F-NEXT: vse16.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i16>, ptr %in0, align 2 @@ -751,33 +725,27 @@ define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-LABEL: vnsrl_0_half_two_source: ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vle16.v v8, (a0) -; V-NEXT: vle16.v v9, (a1) -; V-NEXT: vid.v v10 -; V-NEXT: vadd.vv v10, v10, v10 -; V-NEXT: vadd.vi v10, v10, -4 -; V-NEXT: vmv.v.i v0, 12 +; V-NEXT: vle16.v v8, (a1) +; V-NEXT: vle16.v v9, (a0) ; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; V-NEXT: vnsrl.wi v8, v8, 0 -; V-NEXT: vsetivli zero, 4, e16, mf4, ta, mu -; V-NEXT: vrgather.vv v8, v9, v10, v0.t -; 
V-NEXT: vse16.v v8, (a2) +; V-NEXT: vnsrl.wi v9, v9, 0 +; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma +; V-NEXT: vslideup.vi v9, v8, 2 +; V-NEXT: vse16.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_half_two_source: ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) -; ZVE32F-NEXT: vle16.v v9, (a1) -; ZVE32F-NEXT: vid.v v10 -; ZVE32F-NEXT: vadd.vv v10, v10, v10 -; ZVE32F-NEXT: vadd.vi v10, v10, -4 -; ZVE32F-NEXT: vmv.v.i v0, 12 +; ZVE32F-NEXT: vle16.v v8, (a1) +; ZVE32F-NEXT: vle16.v v9, (a0) ; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 -; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; ZVE32F-NEXT: vrgather.vv v8, v9, v10, v0.t -; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: vnsrl.wi v9, v9, 0 +; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVE32F-NEXT: vslideup.vi v9, v8, 2 +; ZVE32F-NEXT: vse16.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <4 x half>, ptr %in0, align 2 @@ -793,16 +761,12 @@ define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) { ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma ; V-NEXT: vle16.v v8, (a1) ; V-NEXT: vle16.v v9, (a0) -; V-NEXT: li a0, -1 ; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; V-NEXT: vslidedown.vi v10, v8, 2 -; V-NEXT: vwaddu.vv v11, v8, v10 -; V-NEXT: vwmaccu.vx v11, a0, v10 -; V-NEXT: vmv.v.i v0, 12 -; V-NEXT: vnsrl.wi v8, v9, 16 +; V-NEXT: vnsrl.wi v8, v8, 16 +; V-NEXT: vnsrl.wi v9, v9, 16 ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vmerge.vvm v8, v8, v11, v0 -; V-NEXT: vse16.v v8, (a2) +; V-NEXT: vslideup.vi v9, v8, 2 +; V-NEXT: vse16.v v9, (a2) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_16_half_two_source: @@ -810,16 +774,12 @@ define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) { ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVE32F-NEXT: vle16.v v8, (a1) ; ZVE32F-NEXT: vle16.v v9, (a0) -; ZVE32F-NEXT: li a0, -1 ; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; 
ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; ZVE32F-NEXT: vwaddu.vv v11, v8, v10 -; ZVE32F-NEXT: vwmaccu.vx v11, a0, v10 -; ZVE32F-NEXT: vmv.v.i v0, 12 -; ZVE32F-NEXT: vnsrl.wi v8, v9, 16 +; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 +; ZVE32F-NEXT: vnsrl.wi v9, v9, 16 ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vmerge.vvm v8, v8, v11, v0 -; ZVE32F-NEXT: vse16.v v8, (a2) +; ZVE32F-NEXT: vslideup.vi v9, v8, 2 +; ZVE32F-NEXT: vse16.v v9, (a2) ; ZVE32F-NEXT: ret entry: %0 = load <4 x half>, ptr %in0, align 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 9f0b2b3914836..8b41febced065 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -7,31 +7,24 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; CHECK-LABEL: vector_deinterleave_v16i1_v32i1: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v8, v0 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 +; CHECK-NEXT: vslidedown.vi v8, v0, 2 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: li a0, -256 -; CHECK-NEXT: vmerge.vim v11, v10, 1, v0 -; CHECK-NEXT: vadd.vv v12, v9, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vmerge.vim v10, v9, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vadd.vi v10, v12, -16 -; CHECK-NEXT: vadd.vi v12, v12, -15 +; CHECK-NEXT: vmerge.vim v8, v9, 1, v0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v13, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v10, 8 ; CHECK-NEXT: vnsrl.wi v8, v8, 8 
-; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v13, v11, v10, v0.t -; CHECK-NEXT: vrgather.vv v8, v11, v12, v0.t -; CHECK-NEXT: vmsne.vi v0, v13, 0 -; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v9, v11, 8 +; CHECK-NEXT: vslideup.vi v10, v8, 8 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: ret %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) ret {<16 x i1>, <16 x i1>} %retval