diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 976b2478b433e..9c2563729cad7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1527,7 +1527,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, - ISD::INSERT_VECTOR_ELT, ISD::ABS}); + ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP}); if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); if (Subtarget.useRVVForFixedLengthVectors()) @@ -17055,6 +17055,52 @@ static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG, return Val; } +// Convert +// (iX ctpop (bitcast (vXi1 A))) +// -> +// (zext (vcpop.m (nxvYi1 (insert_subvec (vXi1 A))))) +// FIXME: It's complicated to match all the variations of this after type +// legalization so we only handle the pre-type legalization pattern, but that +// requires the fixed vector type to be legal. +static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isScalarInteger()) + return SDValue(); + + SDValue Src = N->getOperand(0); + + // Peek through zero_extend. It doesn't change the count. + if (Src.getOpcode() == ISD::ZERO_EXTEND) + Src = Src.getOperand(0); + + if (Src.getOpcode() != ISD::BITCAST) + return SDValue(); + + Src = Src.getOperand(0); + EVT SrcEVT = Src.getValueType(); + if (!SrcEVT.isSimple()) + return SDValue(); + + MVT SrcMVT = SrcEVT.getSimpleVT(); + // Make sure the input is an i1 vector. + if (!SrcMVT.isVector() || SrcMVT.getVectorElementType() != MVT::i1) + return SDValue(); + + if (!useRVVForFixedLengthVectorVT(SrcMVT, Subtarget)) + return SDValue(); + + MVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcMVT, Subtarget); + Src = convertToScalableVector(ContainerVT, Src, DAG, Subtarget); + + SDLoc DL(N); + auto [Mask, VL] = getDefaultVLOps(SrcMVT, ContainerVT, DL, DAG, Subtarget); + + MVT XLenVT = Subtarget.getXLenVT(); + SDValue Pop = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Src, Mask, VL); + return DAG.getZExtOrTrunc(Pop, DL, VT); +} + SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -18023,6 +18069,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } + case ISD::CTPOP: + if (SDValue V = combineScalarCTPOPToVCPOP(N, DAG, Subtarget)) + return V; + break; } return SDValue(); diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-ctpop-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-ctpop-to-vcpop.ll new file mode 100644 index 0000000000000..e5a8a25db4d41 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/combine-ctpop-to-vcpop.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zbb | FileCheck %s --check-prefixes=CHECK,RV64 + +define i2 @test_v2i1(<2 x i1> %x) { +; CHECK-LABEL: test_v2i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <2 x i1> %x to i2 + %b = call i2 @llvm.ctpop.i2(i2 %a) + ret i2 %b +} + +define i4 @test_v4i1(<4 x i1> %x) { +; CHECK-LABEL: test_v4i1: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <4 x i1> %x to i4 + %b = call i4 @llvm.ctpop.i4(i4 %a) + ret i4 %b +} + +define i8 @test_v8i1(<8 x i1> %x) { +; CHECK-LABEL: test_v8i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <8 x i1> %x to i8 + %b = call i8 @llvm.ctpop.i8(i8 %a) + ret i8 %b +} + +define i16 @test_v16i1(<16 x i1> %x) { +; CHECK-LABEL: test_v16i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <16 x i1> %x to i16 + %b = call i16 @llvm.ctpop.i16(i16 %a) + ret i16 %b +} + +define i32 @test_v32i1(<32 x i1> %x) { +; CHECK-LABEL: test_v32i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <32 x i1> %x to i32 + %b = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %b +} + +define i64 @test_v64i1(<64 x i1> %x) { +; RV32-LABEL: test_v64i1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a0, 64 +; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV32-NEXT: vcpop.m a0, v0 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test_v64i1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a0, 64 +; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; RV64-NEXT: vcpop.m a0, v0 +; RV64-NEXT: ret +entry: + %a = bitcast <64 x i1> %x to i64 + %b = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %b +} + +define i128 @test_v128i1(<128 x i1> %x) { +; RV32-LABEL: test_v128i1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 128 +; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw zero, 4(a0) +; RV32-NEXT: sw zero, 8(a0) +; RV32-NEXT: sw zero, 12(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: test_v128i1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a0, 128 +; RV64-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; RV64-NEXT: vcpop.m a0, v0 +; RV64-NEXT: li a1, 0 +; RV64-NEXT: ret +entry: + %a = bitcast <128 x i1> %x to i128 + %b = call i128 @llvm.ctpop.i128(i128 %a) + ret i128 %b +} + +define i32 @test_trunc_v128i1(<128 x i1> %x) { +; CHECK-LABEL: test_trunc_v128i1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a0, 128 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret +entry: + %a = bitcast <128 x i1> %x to i128 + %b = call i128 @llvm.ctpop.i128(i128 %a) + %c = trunc i128 %b to i32 + ret i32 %c +} + +define i256 @test_v256i1(<256 x i1> %x) { +; RV32-LABEL: test_v256i1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v0, 1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a2, v0 +; RV32-NEXT: vmv.x.s a3, v8 +; RV32-NEXT: vsrl.vx v11, v9, a1 +; RV32-NEXT: vsrl.vx v12, v0, a1 +; RV32-NEXT: vmv.x.s a4, v9 +; RV32-NEXT: vsrl.vx v9, v10, a1 +; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: cpop a3, a3 +; RV32-NEXT: cpop a2, a2 +; RV32-NEXT: vmv.x.s a5, v11 +; RV32-NEXT: vmv.x.s a6, v12 +; RV32-NEXT: vmv.x.s a7, v9 +; RV32-NEXT: vmv.x.s t0, v8 +; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: cpop a4, a4 +; RV32-NEXT: cpop t0, t0 +; RV32-NEXT: cpop a7, a7 +; RV32-NEXT: cpop a6, a6 +; RV32-NEXT: cpop a5, a5 +; RV32-NEXT: add a3, a3, t0 +; RV32-NEXT: add a1, a1, a7 +; RV32-NEXT: add a2, a2, a6 +; RV32-NEXT: add a4, a4, a5 +; 
RV32-NEXT: add a5, a3, a1 +; RV32-NEXT: add a6, a2, a4 +; RV32-NEXT: add a1, a6, a5 +; RV32-NEXT: sltu a3, a5, a3 +; RV32-NEXT: sltu a4, a6, a2 +; RV32-NEXT: sltu a2, a1, a6 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a3, a3, a2 +; RV32-NEXT: beq a3, a4, .LBB8_2 +; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: sltu a2, a3, a4 +; RV32-NEXT: .LBB8_2: # %entry +; RV32-NEXT: sw zero, 16(a0) +; RV32-NEXT: sw zero, 20(a0) +; RV32-NEXT: sw zero, 24(a0) +; RV32-NEXT: sw zero, 28(a0) +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a3, 4(a0) +; RV32-NEXT: sw a2, 8(a0) +; RV32-NEXT: sw zero, 12(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: test_v256i1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v0, 1 +; RV64-NEXT: vmv.x.s a1, v0 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a2, v8 +; RV64-NEXT: vmv.x.s a3, v9 +; RV64-NEXT: vmv.x.s a4, v10 +; RV64-NEXT: cpop a2, a2 +; RV64-NEXT: cpop a1, a1 +; RV64-NEXT: cpop a4, a4 +; RV64-NEXT: cpop a3, a3 +; RV64-NEXT: add a2, a2, a4 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: add a2, a1, a2 +; RV64-NEXT: sltu a1, a2, a1 +; RV64-NEXT: sd a2, 0(a0) +; RV64-NEXT: sd a1, 8(a0) +; RV64-NEXT: sd zero, 16(a0) +; RV64-NEXT: sd zero, 24(a0) +; RV64-NEXT: ret +entry: + %a = bitcast <256 x i1> %x to i256 + %b = call i256 @llvm.ctpop.i256(i256 %a) + ret i256 %b +} + +define i32 @test_trunc_v256i1(<256 x i1> %x) { +; RV32-LABEL: test_trunc_v256i1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v0, 1 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vsrl.vx v11, v9, a0 +; RV32-NEXT: vsrl.vx v12, v0, a0 +; RV32-NEXT: vmv.x.s a3, v9 +; RV32-NEXT: vsrl.vx v9, v10, a0 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: cpop a2, a2 +; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: vmv.x.s a4, v11 +; RV32-NEXT: vmv.x.s a5, v12 +; RV32-NEXT: vmv.x.s a6, v9 +; RV32-NEXT: vmv.x.s a7, v8 +; RV32-NEXT: cpop a0, a0 +; RV32-NEXT: cpop a3, a3 +; RV32-NEXT: cpop a7, a7 +; RV32-NEXT: cpop a6, a6 +; RV32-NEXT: cpop a5, a5 +; RV32-NEXT: cpop a4, a4 +; RV32-NEXT: add a2, a2, a7 +; RV32-NEXT: add a0, a0, a6 +; RV32-NEXT: add a1, a1, a5 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: test_trunc_v256i1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v0, 1 +; RV64-NEXT: vmv.x.s a0, v0 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: vmv.x.s a2, v9 +; RV64-NEXT: vmv.x.s a3, v10 +; RV64-NEXT: cpop a1, a1 +; RV64-NEXT: cpop a0, a0 +; RV64-NEXT: cpop a3, a3 +; RV64-NEXT: cpop a2, a2 +; RV64-NEXT: add a1, a1, a3 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: ret +entry: + %a = bitcast <256 x i1> %x to i256 + %b = call i256 @llvm.ctpop.i256(i256 %a) + %c = trunc i256 %b to i32 + ret i32 %c +} diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll index 400dfd393509c..bfb2d0a3accc4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll @@ -453,20 +453,17 @@ define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %da ; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; RV64-NEXT: vcompress.vm v24, v8, v0 ; RV64-NEXT: vcpop.m a2, v0 -; RV64-NEXT: 
vsetvli zero, a2, e16, m8, ta, ma -; RV64-NEXT: vse16.v v24, (a0) ; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64-NEXT: vslidedown.vi v8, v0, 8 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.x.s a2, v0 ; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV64-NEXT: vcompress.vm v24, v16, v8 +; RV64-NEXT: vcompress.vm v0, v16, v8 ; RV64-NEXT: vcpop.m a1, v8 -; RV64-NEXT: cpop a2, a2 +; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-NEXT: vse16.v v24, (a0) ; RV64-NEXT: slli a2, a2, 1 ; RV64-NEXT: add a0, a0, a2 ; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; RV64-NEXT: vse16.v v24, (a0) +; RV64-NEXT: vse16.v v0, (a0) ; RV64-NEXT: ret ; ; RV32-LABEL: test_compresstore_v128i16: @@ -673,20 +670,17 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vcompress.vm v24, v8, v0 ; RV32-NEXT: vcpop.m a2, v0 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vse32.v v24, (a0) ; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v0, 4 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vmv.x.s a2, v0 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vcompress.vm v24, v16, v8 +; RV32-NEXT: vcompress.vm v0, v16, v8 ; RV32-NEXT: vcpop.m a1, v8 -; RV32-NEXT: cpop a2, a2 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vse32.v v24, (a0) ; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vse32.v v24, (a0) +; RV32-NEXT: vse32.v v0, (a0) ; RV32-NEXT: ret entry: tail call void @llvm.masked.compressstore.v64i32(<64 x i32> %data, ptr align 4 %p, <64 x i1> %mask) diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll index f1fcaed2762ae..b32d85bb1943a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll @@ -749,60 +749,66 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x ; CHECK-RV64-NEXT: sub sp, sp, a1 ; CHECK-RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV64-NEXT: csrr a1, vlenb -; CHECK-RV64-NEXT: slli a1, a1, 4 +; CHECK-RV64-NEXT: slli a1, a1, 3 ; CHECK-RV64-NEXT: add a1, sp, a1 ; CHECK-RV64-NEXT: addi a1, a1, 16 ; CHECK-RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: li a1, 64 ; CHECK-RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-RV64-NEXT: vslidedown.vi v7, v0, 8 -; CHECK-RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-RV64-NEXT: vmv.x.s a2, v0 -; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-RV64-NEXT: vcpop.m a3, v0 -; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; CHECK-RV64-NEXT: vle16.v v24, (a0) -; CHECK-RV64-NEXT: csrr a3, vlenb -; CHECK-RV64-NEXT: li a4, 24 -; CHECK-RV64-NEXT: mul a3, a3, a4 -; CHECK-RV64-NEXT: add a3, sp, a3 -; CHECK-RV64-NEXT: addi a3, a3, 16 -; CHECK-RV64-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-RV64-NEXT: vcpop.m a2, v0 ; CHECK-RV64-NEXT: vcpop.m a3, v7 -; CHECK-RV64-NEXT: cpop a2, a2 +; CHECK-RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-RV64-NEXT: vle16.v v24, (a0) +; CHECK-RV64-NEXT: csrr a4, vlenb +; CHECK-RV64-NEXT: slli a4, a4, 4 +; CHECK-RV64-NEXT: add a4, sp, a4 +; CHECK-RV64-NEXT: addi a4, a4, 16 +; CHECK-RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded 
Spill ; CHECK-RV64-NEXT: slli a2, a2, 1 ; CHECK-RV64-NEXT: add a0, a0, a2 ; CHECK-RV64-NEXT: vsetvli zero, a3, e16, m8, ta, ma -; CHECK-RV64-NEXT: vle16.v v16, (a0) +; CHECK-RV64-NEXT: vle16.v v24, (a0) ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: li a2, 24 +; CHECK-RV64-NEXT: mul a0, a0, a2 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-RV64-NEXT: viota.m v24, v0 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: li a1, 24 -; CHECK-RV64-NEXT: mul a0, a0, a1 +; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV64-NEXT: viota.m v8, v7 -; CHECK-RV64-NEXT: vmv1r.v v0, v7 +; CHECK-RV64-NEXT: viota.m v16, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb ; CHECK-RV64-NEXT: slli a0, a0, 4 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 -; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV64-NEXT: vmv1r.v v0, v7 ; CHECK-RV64-NEXT: csrr a0, vlenb -; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: li a1, 24 +; CHECK-RV64-NEXT: mul a0, a0, a1 ; CHECK-RV64-NEXT: add a0, sp, a0 ; CHECK-RV64-NEXT: addi a0, a0, 16 ; CHECK-RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 3 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV64-NEXT: csrr a0, vlenb +; CHECK-RV64-NEXT: slli a0, a0, 4 +; CHECK-RV64-NEXT: add a0, sp, a0 +; CHECK-RV64-NEXT: addi a0, a0, 16 +; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV64-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV64-NEXT: addi a0, sp, 16 ; CHECK-RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -844,12 +850,9 @@ define <128 x i16> @test_expandload_v128i16_all_ones(ptr %base, <128 x i16> %pas ; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-RV64-NEXT: vle16.v v8, (a0) ; CHECK-RV64-NEXT: vmset.m v16 -; CHECK-RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-RV64-NEXT: vmv.x.s a2, v16 -; CHECK-RV64-NEXT: cpop a2, a2 -; CHECK-RV64-NEXT: slli a2, a2, 1 -; CHECK-RV64-NEXT: add a0, a0, a2 -; CHECK-RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-RV64-NEXT: vcpop.m a1, v16 +; CHECK-RV64-NEXT: slli a1, a1, 1 +; CHECK-RV64-NEXT: add a0, a0, a1 ; CHECK-RV64-NEXT: vle16.v v16, (a0) ; CHECK-RV64-NEXT: ret %res = call <128 x i16> @llvm.masked.expandload.v128i16(ptr align 2 %base, <128 x i1> splat (i1 true), <128 x i16> %passthru) @@ -1020,60 +1023,66 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32> ; CHECK-RV32-NEXT: sub sp, sp, a1 ; CHECK-RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-RV32-NEXT: csrr a1, vlenb -; CHECK-RV32-NEXT: slli a1, a1, 4 +; CHECK-RV32-NEXT: slli a1, a1, 3 ; CHECK-RV32-NEXT: add a1, sp, a1 ; CHECK-RV32-NEXT: addi a1, a1, 16 ; CHECK-RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded 
Spill ; CHECK-RV32-NEXT: li a1, 32 ; CHECK-RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-RV32-NEXT: vslidedown.vi v7, v0, 4 -; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.x.s a2, v0 -; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-RV32-NEXT: vcpop.m a3, v0 -; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV32-NEXT: vle32.v v24, (a0) -; CHECK-RV32-NEXT: csrr a3, vlenb -; CHECK-RV32-NEXT: li a4, 24 -; CHECK-RV32-NEXT: mul a3, a3, a4 -; CHECK-RV32-NEXT: add a3, sp, a3 -; CHECK-RV32-NEXT: addi a3, a3, 16 -; CHECK-RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-RV32-NEXT: vcpop.m a2, v0 ; CHECK-RV32-NEXT: vcpop.m a3, v7 -; CHECK-RV32-NEXT: cpop a2, a2 +; CHECK-RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-RV32-NEXT: vle32.v v24, (a0) +; CHECK-RV32-NEXT: csrr a4, vlenb +; CHECK-RV32-NEXT: slli a4, a4, 4 +; CHECK-RV32-NEXT: add a4, sp, a4 +; CHECK-RV32-NEXT: addi a4, a4, 16 +; CHECK-RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: slli a2, a2, 2 ; CHECK-RV32-NEXT: add a0, a0, a2 ; CHECK-RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; CHECK-RV32-NEXT: vle32.v v16, (a0) +; CHECK-RV32-NEXT: vle32.v v24, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: li a2, 24 +; CHECK-RV32-NEXT: mul a0, a0, a2 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-RV32-NEXT: viota.m v24, v0 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: li a1, 24 -; CHECK-RV32-NEXT: mul a0, a0, a1 +; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v8, v16, v24, v0.t ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-RV32-NEXT: viota.m v8, v7 -; CHECK-RV32-NEXT: vmv1r.v v0, v7 +; CHECK-RV32-NEXT: viota.m v16, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-RV32-NEXT: vmv1r.v v0, v7 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: li a1, 24 +; CHECK-RV32-NEXT: mul a0, a0, a1 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 ; CHECK-RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 3 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-RV32-NEXT: csrr a0, vlenb +; CHECK-RV32-NEXT: slli a0, a0, 4 +; CHECK-RV32-NEXT: add a0, sp, a0 +; CHECK-RV32-NEXT: addi a0, a0, 16 +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-RV32-NEXT: vrgather.vv v16, v24, v8, v0.t ; CHECK-RV32-NEXT: addi a0, sp, 16 ; CHECK-RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload @@ -1169,8 +1178,7 @@ define <64 x i32> @test_expandload_v64i32_all_ones(ptr %base, <64 x i32> %passth ; CHECK-RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-RV32-NEXT: vle32.v v8, (a0) 
; CHECK-RV32-NEXT: vmset.m v16 -; CHECK-RV32-NEXT: vmv.x.s a1, v16 -; CHECK-RV32-NEXT: cpop a1, a1 +; CHECK-RV32-NEXT: vcpop.m a1, v16 ; CHECK-RV32-NEXT: slli a1, a1, 2 ; CHECK-RV32-NEXT: add a0, a0, a1 ; CHECK-RV32-NEXT: vle32.v v16, (a0)
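Note (illustrative, not part of the patch): the pattern the new combineScalarCTPOPToVCPOP targets can be exercised in isolation with a standalone .ll file. The function name below is made up, and the RUN and CHECK lines simply mirror the test_v16i1 case from combine-ctpop-to-vcpop.ll above, so this sketch adds no coverage beyond what the patch already tests; it only restates the before/after behavior in one place.

    ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zbb | FileCheck %s
    ; A scalar popcount of a bitcast <16 x i1> mask is now selected to a
    ; single vcpop.m on the mask register, instead of moving the mask bits
    ; into a GPR and counting them with a scalar cpop.
    define i16 @mask_popcount(<16 x i1> %x) {
    entry:
      %a = bitcast <16 x i1> %x to i16
      %b = call i16 @llvm.ctpop.i16(i16 %a)
      ret i16 %b
    }
    ; CHECK-LABEL: mask_popcount:
    ; CHECK:       vsetivli zero, 16, e8, m1, ta, ma
    ; CHECK-NEXT:  vcpop.m a0, v0
    ; CHECK-NEXT:  ret

As in the new tests, this relies on the combine firing before type legalization and on the fixed-length i1 vector type being handled by useRVVForFixedLengthVectorVT; the wide-mask cases (<256 x i1>) in the test file show the existing scalar cpop lowering when those conditions do not hold.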