diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1aeca7f370aa1..b632c50dae0e3 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -419,8 +419,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::VECTOR_SHUFFLE, - {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, - Expand); + {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32, + MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32}, + Custom); if (Subtarget->hasPkMovB32()) { // TODO: 16-bit element vectors should be legal with even aligned elements. @@ -7589,15 +7590,38 @@ static bool elementPairIsContiguous(ArrayRef Mask, int Elt) { return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); } +static bool elementPairIsOddToEven(ArrayRef Mask, int Elt) { + assert(Elt % 2 == 0); + return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) && + !(Mask[Elt + 1] & 1); +} + SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); EVT ResultVT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op); MVT EltVT = ResultVT.getVectorElementType().getSimpleVT(); - MVT PackVT = MVT::getVectorVT(EltVT, 2); + const int NewSrcNumElts = 2; + MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts); int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); + // Break up the shuffle into register-sized pieces. + // + // We're trying to form sub-shuffles that the register allocation pipeline + // won't be able to figure out, like how to use v_pk_mov_b32 to do a register + // blend or 16-bit op_sel. It should be able to figure out how to reassemble a + // pair of copies into a consecutive register copy, so use the ordinary + // extract_vector_elt lowering unless we can use the shuffle. 
+ // + // TODO: This is a bit of a hack, and we should probably always use + // extract_subvector for the largest possible subvector we can (or at least + // use it for PackVT aligned pieces). However, we have worse support for + // combines on them, and don't directly treat extract_subvector / + // insert_subvector as legal. The DAG scheduler also ends up doing a worse + // job with the extract_subvectors. + const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16; + // vector_shuffle <0,1,6,7> lhs, rhs // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) // @@ -7608,9 +7632,18 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) // Avoid scalarizing when both halves are reading from consecutive elements. - SmallVector Pieces; + + // If we're treating 2-element shuffles as legal, also create odd-to-even + // shuffles of neighboring pairs. + // + // vector_shuffle <3,2,7,6> lhs, rhs + // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 2) + // vector_shuffle <1, 0> (extract_subvector rhs, 2) + + SmallVector Pieces; for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { - if (elementPairIsContiguous(SVN->getMask(), I)) { + if (ShouldUseConsecutiveExtract && + elementPairIsContiguous(SVN->getMask(), I)) { const int Idx = SVN->getMaskElt(I); int VecIdx = Idx < SrcNumElts ? 0 : 1; int EltIdx = Idx < SrcNumElts ? 
Idx : Idx - SrcNumElts; @@ -7618,6 +7651,48 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, SVN->getOperand(VecIdx), DAG.getConstant(EltIdx, SL, MVT::i32)); Pieces.push_back(SubVec); + } else if (elementPairIsOddToEven(SVN->getMask(), I) && + isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) { + int Idx0 = SVN->getMaskElt(I); + int Idx1 = SVN->getMaskElt(I + 1); + + SDValue SrcOp0 = SVN->getOperand(0); + SDValue SrcOp1 = SrcOp0; + if (Idx0 >= SrcNumElts) { + SrcOp0 = SVN->getOperand(1); + Idx0 -= SrcNumElts; + } + + if (Idx1 >= SrcNumElts) { + SrcOp1 = SVN->getOperand(1); + Idx1 -= SrcNumElts; + } + + int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1); + int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1); + + // Extract nearest even aligned piece. + SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0, + DAG.getConstant(AlignedIdx0, SL, MVT::i32)); + SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1, + DAG.getConstant(AlignedIdx1, SL, MVT::i32)); + + int NewMaskIdx0 = Idx0 - AlignedIdx0; + int NewMaskIdx1 = Idx1 - AlignedIdx1; + + SDValue Result0 = SubVec0; + SDValue Result1 = SubVec0; + + if (SubVec0 != SubVec1) { + NewMaskIdx1 += NewSrcNumElts; + Result1 = SubVec1; + } else { + Result1 = DAG.getUNDEF(PackVT); + } + + SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1, + {NewMaskIdx0, NewMaskIdx1}); + Pieces.push_back(Shuf); } else { const int Idx0 = SVN->getMaskElt(I); const int Idx1 = SVN->getMaskElt(I + 1);