From 3c657838ee76b92695694c39544c142dbe36a67b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 14 Feb 2025 13:39:43 +0000 Subject: [PATCH] [DAG] replaceShuffleOfInsert - add support for shuffle_vector(scalar_to_vector(x),y) -> insert_vector_elt(y,x,c) Begin extending replaceShuffleOfInsert to handle other forms of scalar insertion into a vector. I've limited this to targets that just have Custom/Legal ISD::INSERT_VECTOR_ELT handling for now - although we can probably always fold this before LegalOperations. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 +++++++--- .../PowerPC/v4i32_scalar_to_vector_shuffle.ll | 34 ++++++------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c6fd72b6b76f4..82c4cbf793ee7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -626,6 +626,7 @@ namespace { SDValue CombineZExtLogicopShiftLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); SDValue combineFMulOrFDivWithIntPow2(SDNode *N); + SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf); SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex); @@ -26102,8 +26103,7 @@ static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) { /// If a shuffle inserts exactly one element from a source vector operand into /// another vector operand and we can access the specified element as a scalar, /// then we can eliminate the shuffle. -static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, - SelectionDAG &DAG) { +SDValue DAGCombiner::replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf) { // First, check if we are taking one element of a vector and shuffling that // element into another vector. 
ArrayRef<int> Mask = Shuf->getMask(); @@ -26126,7 +26126,7 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, // Now see if we can access that element as a scalar via a real insert element // instruction. // TODO: We can try harder to locate the element as a scalar. Examples: it - // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. + // could be an operand of BUILD_VECTOR, or a constant. assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && "Shuffle mask value must be from operand 0"); @@ -26149,6 +26149,16 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, Op1, Elt, NewInsIndex); } + if (!hasOperation(ISD::INSERT_VECTOR_ELT, Op0.getValueType())) + return SDValue(); + + if (sd_match(Op0, m_UnaryOp(ISD::SCALAR_TO_VECTOR, m_Value(Elt))) && + Mask[ShufOp0Index] == 0) { + SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf)); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), + Op1, Elt, NewInsIndex); + } + return SDValue(); } @@ -26220,7 +26230,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } - if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) + if (SDValue InsElt = replaceShuffleOfInsert(SVN)) return InsElt; // A shuffle of a single vector that is a splatted value can always be folded. 
diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll index 402a4f34e62b2..d98b78dfdd3b0 100644 --- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -239,13 +239,10 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; ; CHECK-LE-P9-LABEL: test_none_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: li r3, 0 -; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 -; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: xxperm v2, v2, vs0 ; CHECK-LE-P9-NEXT: stxv v2, 0(r5) ; CHECK-LE-P9-NEXT: blr ; @@ -263,14 +260,11 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: li r3, 0 -; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 -; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, v2, vs0 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: @@ -286,13 +280,10 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: li r4, 0 -; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 -; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; 
CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxperm v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v4i32: @@ -308,13 +299,10 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; ; CHECK-AIX-32-P9-LABEL: test_none_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r4) -; CHECK-AIX-32-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxperm v2, v2, vs0 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = extractelement <2 x i32> %vec, i64 0