diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index a903eaa6cbe54..5d41d1cd14ef4 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -270,7 +270,8 @@ void processShuffleMasks(
     ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
     unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
-    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
+    function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
+        ManyInputsAction);
 
 /// Compute the demanded elements mask of horizontal binary operations. A
 /// horizontal operation combines two adjacent elements in a vector operand.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index b4b311cb727a1..ad80e458ab57d 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -557,7 +557,8 @@ void llvm::processShuffleMasks(
     ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
     unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
-    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction) {
+    function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
+        ManyInputsAction) {
   SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
   // Try to perform better estimation of the permutation.
   // 1. Split the source/destination vectors into real registers.
@@ -628,6 +629,7 @@ void llvm::processShuffleMasks(
         }
       };
       int SecondIdx;
+      bool NewReg = true;
       do {
         int FirstIdx = -1;
         SecondIdx = -1;
@@ -645,7 +647,8 @@ void llvm::processShuffleMasks(
           SecondIdx = I;
           SecondMask = RegMask;
           CombineMasks(FirstMask, SecondMask);
-          ManyInputsAction(FirstMask, FirstIdx, SecondIdx);
+          ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);
+          NewReg = false;
           NormalizeMask(FirstMask);
           RegMask.clear();
           SecondMask = FirstMask;
@@ -653,7 +656,8 @@ void llvm::processShuffleMasks(
         }
         if (FirstIdx != SecondIdx && SecondIdx >= 0) {
           CombineMasks(SecondMask, FirstMask);
-          ManyInputsAction(SecondMask, SecondIdx, FirstIdx);
+          ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);
+          NewReg = false;
           Dest[FirstIdx].clear();
           NormalizeMask(SecondMask);
         }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5117eb8d91dfb..f39d9ca15496a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3059,8 +3059,8 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
         Inputs[Idx] = Output;
       },
       [&AccumulateResults, &Output, &DAG = DAG, NewVT, &DL, &Inputs,
-       &TmpInputs,
-       &BuildVector](ArrayRef<int> Mask, unsigned Idx1, unsigned Idx2) {
+       &TmpInputs, &BuildVector](ArrayRef<int> Mask, unsigned Idx1,
+                                 unsigned Idx2, bool /*Unused*/) {
         if (AccumulateResults(Idx1)) {
           if (Inputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR &&
               Inputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR)
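Note: the `ManyInputsAction` callback now receives a fourth `bool` argument that is true only for the first combining step emitted for a given destination register, so callers can tell where one destination's shuffle chain ends and the next begins. A minimal sketch of a caller grouping steps per destination with the new flag (the `Steps` container and lambda bodies are illustrative, not part of this patch):

```cpp
SmallVector<SmallVector<SmallVector<int>>> Steps;
processShuffleMasks(
    Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
    // No input defines this destination register: keep an empty group.
    [&]() { Steps.emplace_back(); },
    // Exactly one source register feeds this destination register.
    [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DstReg) {
      Steps.emplace_back().emplace_back(RegMask.begin(), RegMask.end());
    },
    // Several source registers are combined pairwise; NewReg marks the
    // first pair of each destination register's chain.
    [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
      if (NewReg)
        Steps.emplace_back();
      Steps.back().emplace_back(RegMask.begin(), RegMask.end());
    });
```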
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 95f1deed8b6c0..b25cb128bce9f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5104,7 +5104,6 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   SDValue V1 = SVN->getOperand(0);
   SDValue V2 = SVN->getOperand(1);
   ArrayRef<int> Mask = SVN->getMask();
-  unsigned NumElts = VT.getVectorNumElements();
 
   // If we don't know exact data layout, not much we can do.  If this
   // is already m1 or smaller, no point in splitting further.
@@ -5121,58 +5120,102 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
 
   MVT ElemVT = VT.getVectorElementType();
   unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
-  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
-
-  SmallVector<std::pair<int, SmallVector<int>>>
-    OutMasks(VRegsPerSrc, {-1, {}});
-
-  // Check if our mask can be done as a 1-to-1 mapping from source
-  // to destination registers in the group without needing to
-  // write each destination more than once.
-  for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
-    int DstVecIdx = DstIdx / ElemsPerVReg;
-    int DstSubIdx = DstIdx % ElemsPerVReg;
-    int SrcIdx = Mask[DstIdx];
-    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
-      continue;
-    int SrcVecIdx = SrcIdx / ElemsPerVReg;
-    int SrcSubIdx = SrcIdx % ElemsPerVReg;
-    if (OutMasks[DstVecIdx].first == -1)
-      OutMasks[DstVecIdx].first = SrcVecIdx;
-    if (OutMasks[DstVecIdx].first != SrcVecIdx)
-      // Note: This case could easily be handled by keeping track of a chain
-      // of source values and generating two element shuffles below.  This is
-      // less an implementation question, and more a profitability one.
-      return SDValue();
-
-    OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
-    OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
-  }
 
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
   MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
   assert(M1VT == getLMUL1VT(M1VT));
   unsigned NumOpElts = M1VT.getVectorMinNumElements();
-  SDValue Vec = DAG.getUNDEF(ContainerVT);
+  unsigned NumElts = ContainerVT.getVectorMinNumElements();
+  unsigned NumOfSrcRegs = NumElts / NumOpElts;
+  unsigned NumOfDestRegs = NumElts / NumOpElts;
   // The following semantically builds up a fixed length concat_vector
   // of the component shuffle_vectors.  We eagerly lower to scalable here
   // to avoid DAG combining it back to a large shuffle_vector again.
   V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
   V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
-  for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
-    auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
-    if (SrcVecIdx == -1)
-      continue;
-    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
-    SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
+  SmallVector<SmallVector<std::tuple<unsigned, unsigned, SmallVector<int>>>>
+      Operands;
+  processShuffleMasks(
+      Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
+      [&]() { Operands.emplace_back(); },
+      [&](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx, unsigned DstVecIdx) {
+        Operands.emplace_back().emplace_back(
+            SrcVecIdx, UINT_MAX,
+            SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end()));
+      },
+      [&](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
+        if (NewReg)
+          Operands.emplace_back();
+        Operands.back().emplace_back(
+            Idx1, Idx2, SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end()));
+      });
+  assert(Operands.size() == NumOfDestRegs && "Whole vector must be processed");
+  // Note: check that we do not emit too many shuffles here to prevent code
+  // size explosion.
+  // TODO: Investigate whether this can be improved by extra analysis of the
+  // masks to check if the code is more profitable.
+  unsigned NumShuffles = std::accumulate(
+      Operands.begin(), Operands.end(), 0u,
+      [&](unsigned N,
+          ArrayRef<std::tuple<unsigned, unsigned, SmallVector<int>>> Data) {
+        if (Data.empty())
+          return N;
+        N += Data.size();
+        for (const auto &P : Data) {
+          unsigned Idx2 = std::get<1>(P);
+          ArrayRef<int> Mask = std::get<2>(P);
+          if (Idx2 != UINT_MAX)
+            ++N;
+          else if (ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
+            --N;
+        }
+        return N;
+      });
+  if ((NumOfDestRegs > 2 && NumShuffles > NumOfDestRegs) ||
+      (NumOfDestRegs <= 2 && NumShuffles >= 4))
+    return SDValue();
+  auto ExtractValue = [&, &DAG = DAG](SDValue SrcVec, unsigned ExtractIdx) {
     SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
                                  DAG.getVectorIdxConstant(ExtractIdx, DL));
     SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
-    SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
-    SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
-    unsigned InsertIdx = DstVecIdx * NumOpElts;
-    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+    return SubVec;
+  };
+  auto PerformShuffle = [&, &DAG = DAG](SDValue SubVec1, SDValue SubVec2,
+                                        ArrayRef<int> Mask) {
+    SDValue SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, Mask);
+    return SubVec;
+  };
+  SDValue Vec = DAG.getUNDEF(ContainerVT);
+  for (auto [I, Data] : enumerate(Operands)) {
+    if (Data.empty())
+      continue;
+    SmallDenseMap<unsigned, SDValue, 4> Values;
+    for (unsigned I : seq<unsigned>(Data.size())) {
+      const auto &[Idx1, Idx2, _] = Data[I];
+      if (Values.contains(Idx1)) {
+        assert(Idx2 != UINT_MAX && Values.contains(Idx2) &&
+               "Expected both indices to be extracted already.");
+        break;
+      }
+      SDValue V = ExtractValue(Idx1 >= NumOfSrcRegs ? V2 : V1,
+                               (Idx1 % NumOfSrcRegs) * NumOpElts);
+      Values[Idx1] = V;
+      if (Idx2 != UINT_MAX)
+        Values[Idx2] = ExtractValue(Idx2 >= NumOfSrcRegs ? V2 : V1,
+                                    (Idx2 % NumOfSrcRegs) * NumOpElts);
+    }
+    SDValue V;
+    for (const auto &[Idx1, Idx2, Mask] : Data) {
+      SDValue V1 = Values.at(Idx1);
+      SDValue V2 = Idx2 == UINT_MAX ? V1 : Values.at(Idx2);
+      V = PerformShuffle(V1, V2, Mask);
+      Values[Idx1] = V;
+    }
+
+    unsigned InsertIdx = I * NumOpElts;
+    V = convertToScalableVector(M1VT, V, DAG, Subtarget);
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
                       DAG.getVectorIdxConstant(InsertIdx, DL));
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
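Note: the guard above rejects splits that would expand into too many per-register shuffles. Restated as a free-standing predicate for clarity (the helper name and standalone form are ours, for illustration only):

```cpp
// Accepts the split only within the patch's limits: at most one shuffle per
// destination register when there are more than two destinations, and fewer
// than four shuffles in total otherwise. This is the negation of the early
// "return SDValue()" condition above.
static bool profitableToSplit(unsigned NumOfDestRegs, unsigned NumShuffles) {
  if (NumOfDestRegs > 2)
    return NumShuffles <= NumOfDestRegs;
  return NumShuffles < 4;
}
```

Single-source chunks whose submask is an identity are discounted in the count, since they lower to a plain subregister move rather than a real shuffle.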
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index c19bcfc5524cc..413b54343ef0e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1774,9 +1774,9 @@ InstructionCost X86TTIImpl::getShuffleCost(
             PrevSrcReg = SrcReg;
             PrevRegMask = RegMask;
           },
-          [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
-                                              unsigned /*Unused*/,
-                                              unsigned /*Unused*/) {
+          [this, SingleOpTy, CostKind,
+           &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
+                  unsigned /*Unused*/, bool /*Unused*/) {
             Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
                                    CostKind, 0, nullptr);
           });
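The X86 cost-model change is mechanical (the new flag is ignored), while the test updates below show the codegen effect: whole-group gathers through constant-pool index vectors become a handful of m1-granularity shuffles. As a toy illustration of the underlying decomposition (standalone C++, ours, not LLVM code; with vscale_range(2,2) and e64 one vector register holds two elements, and the mask matches m2_splat_into_slide_two_source):

```cpp
#include <cstdio>
#include <vector>

int main() {
  const unsigned ElemsPerVReg = 2;          // e64 at VLEN=128
  const std::vector<int> Mask = {0, 0, 5, 6}; // flat two-source shuffle mask
  // Split the flat mask into one submask per destination register.
  std::vector<std::vector<int>> SubMasks(Mask.size() / ElemsPerVReg);
  for (unsigned I = 0; I < Mask.size(); ++I)
    SubMasks[I / ElemsPerVReg].push_back(Mask[I]);
  // dst 0: 0 0  -> splat of %v1's first register, one vrgather.vi
  // dst 1: 5 6  -> straddles %v2's two registers, a slide pair
  for (unsigned R = 0; R < SubMasks.size(); ++R) {
    std::printf("dst %u:", R);
    for (int M : SubMasks[R])
      std::printf(" %d", M);
    std::printf("\n");
  }
  return 0;
}
```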
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index 4603c0d24f5d7..54d0acc3ba8b5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -168,12 +168,11 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
 ; CHECK-LABEL: m2_splat_into_slide_two_source:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v13, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
 ; CHECK-NEXT:    vrgather.vi v12, v8, 0
-; CHECK-NEXT:    vslideup.vi v12, v10, 1, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 5, i32 6>
   ret <4 x i64> %res
@@ -183,18 +182,17 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a0, a0, 252
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    li a0, 175
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v8, 1
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vadd.vi v8, v8, 1
-; CHECK-NEXT:    vrgather.vv v11, v9, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v10, 0, v0
+; CHECK-NEXT:    vle32.v v11, (a0)
+; CHECK-NEXT:    vmv.v.i v0, 5
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vsrl.vi v10, v10, 1
+; CHECK-NEXT:    vadd.vi v10, v10, 1
+; CHECK-NEXT:    vrgather.vv v9, v11, v10, v0.t
 ; CHECK-NEXT:    addi a0, a1, 672
 ; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -211,15 +209,15 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) {
 define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 ; CHECK-LABEL: shuffle2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    li a0, -97
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vrsub.vi v9, v9, 4
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmerge.vim v8, v12, 0, v0
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vid.v v13
+; CHECK-NEXT:    vadd.vv v13, v13, v13
+; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vrsub.vi v13, v13, 4
+; CHECK-NEXT:    vrgather.vv v9, v12, v13, v0.t
 ; CHECK-NEXT:    ret
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> , float %b, i32 5
@@ -231,16 +229,15 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) {
 define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) {
 ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    li a1, 16
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; RV32-NEXT:    vmv.v.i v0, 1
 ; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vrgather.vi v16, v8, 15, v0.t
-; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV32-NEXT:    vrgather.vi v18, v15, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vx v8, v16, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -258,13 +255,14 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    li a1, -17
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 1
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    vrgather.vi v16, v8, 15
-; RV64-NEXT:    vmerge.vim v8, v16, 0, v0
+; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; RV64-NEXT:    vrgather.vi v18, v15, 1, v0.t
 ; RV64-NEXT:    mv s2, sp
-; RV64-NEXT:    vs8r.v v8, (s2)
+; RV64-NEXT:    vs8r.v v16, (s2)
 ; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    li a1, 8
 ; RV64-NEXT:    call __muldi3
@@ -290,21 +288,16 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca
 define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range(2,2) {
 ; CHECK-LABEL: shuffles_add:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
+; CHECK-NEXT:    vmv1r.v v13, v10
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    vmv.v.i v0, 1
+; CHECK-NEXT:    vrgather.vi v12, v9, 0
+; CHECK-NEXT:    vmv1r.v v9, v11
+; CHECK-NEXT:    vrgather.vi v9, v10, 1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v12, v8, 2
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vrgather.vi v16, v8, 3
-; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v14, v14
-; CHECK-NEXT:    vadd.vi v9, v8, -4
-; CHECK-NEXT:    vadd.vi v8, v8, -3
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9, v0.t
-; CHECK-NEXT:    vrgatherei16.vv v16, v10, v8, v0.t
-; CHECK-NEXT:    vfadd.vv v8, v12, v16
+; CHECK-NEXT:    vfadd.vv v8, v12, v8
 ; CHECK-NEXT:    ret
   %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
   %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
@@ -332,12 +325,13 @@ entry:
 define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) {
 ; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a0, %hi(.LCPI18_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI18_0)
-; CHECK-NEXT:    vl2re16.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
+; CHECK-NEXT:    vmv.v.i v0, 8
+; CHECK-NEXT:    vrgather.vi v12, v10, 0
+; CHECK-NEXT:    vrgather.vi v12, v11, 0, v0.t
+; CHECK-NEXT:    vrgather.vi v14, v8, 2
+; CHECK-NEXT:    vrgather.vi v15, v10, 3
+; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
 entry:
   %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <16 x i32>
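This last test appears to sit exactly on the new limit: the m4 shuffle splits into four m1 destination registers, and by the accounting above it needs four single-register shuffles (the two-source first chunk counts twice, one chunk is untouched), so the per-register lowering is kept instead of the old constant-pool vrgatherei16. Checking the boundary case against the predicate sketched earlier (hypothetical helper, ours):

```cpp
#include <cassert>

// Restates the profitableToSplit sketch from above (hypothetical helper,
// not patch code).
static bool profitableToSplit(unsigned NumOfDestRegs, unsigned NumShuffles) {
  return NumOfDestRegs > 2 ? NumShuffles <= NumOfDestRegs : NumShuffles < 4;
}

int main() {
  // m4_linear_num_of_shuffles_in_chunks: 4 destination registers, 4 shuffles.
  assert(profitableToSplit(/*NumOfDestRegs=*/4, /*NumShuffles=*/4));
  // One more shuffle would tip it back to the generic lowering.
  assert(!profitableToSplit(/*NumOfDestRegs=*/4, /*NumShuffles=*/5));
  return 0;
}
```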