[VectorCombine] Combine extract/insert from vector #115213

ParkHanbum · 2024-11-06T21:17:21Z

insert (DstVec, (extract SrcVec, ExtIdx), InsIdx)
--> shuffle (DstVec, SrcVec, Mask)

This commit combines an extract/insert pair on vectors into a single
vector shuffle.

llvmbot · 2024-11-06T21:17:56Z

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-vectorizers

Author: hanbeom (ParkHanbum)

Changes

insert (DstVec, (extract (binop), ExtIdx), InsIdx) --> shuffle (DstVec, (binop), Mask)

This commit combines an extract/insert of a vector BinaryOperator result into a single shuffle of the destination vector and the vector BinaryOperator.

Patch is 24.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115213.diff

5 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+48)
(modified) llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll (+58-29)
(modified) llvm/test/Transforms/VectorCombine/X86/extract-binop.ll (+58-29)
(modified) llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll (+26-13)
(modified) llvm/test/Transforms/VectorCombine/X86/load.ll (+26-13)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 58145c7e3c5913..0ccd535303686d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -106,6 +106,7 @@ class VectorCombine {
                        Instruction &I);
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
+  bool foldInsExtOfBinOpShuffle(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
@@ -2678,6 +2679,52 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
   return true;
 }
 
+/// insert (DstVec, (extract (binop), ExtIdx), InsIdx) -->
+/// shuffl (DstVec, (binop), Mask)
+bool VectorCombine::foldInsExtOfBinOpShuffle(Instruction &I) {
+  Value *DstVec;
+  BinaryOperator *BO;
+  uint64_t ExtIdx, InsIdx;
+  if (!match(&I, m_InsertElt(
+                     m_Value(DstVec),
+                     m_OneUse(m_ExtractElt(m_BinOp(BO), m_ConstantInt(ExtIdx))),
+                     m_ConstantInt(InsIdx))))
+    return false;
+
+  if (!isSafeToSpeculativelyExecute(BO))
+    return false;
+
+  auto *VecTy = cast<FixedVectorType>(I.getType());
+  if (BO->getType() != VecTy)
+    return false;
+
+  unsigned NumElts = VecTy->getNumElements();
+  if (ExtIdx >= NumElts)
+    return false;
+
+  SmallVector<int> Mask(NumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Mask[InsIdx] = ExtIdx + NumElts;
+  // Cost
+  ExtractElementInst *Ext;
+  if ((Ext = dyn_cast<ExtractElementInst>(I.getOperand(0))) == nullptr)
+    Ext = dyn_cast<ExtractElementInst>(I.getOperand(1));
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+  InstructionCost NewCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+  if (OldCost < NewCost)
+    return false;
+
+  Value *Shuf = Builder.CreateShuffleVector(DstVec, BO, Mask);
+  replaceValue(I, *Shuf);
+
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -2734,6 +2781,7 @@ bool VectorCombine::run() {
       switch (Opcode) {
       case Instruction::InsertElement:
         MadeChange |= foldInsExtFNeg(I);
+        MadeChange |= foldInsExtOfBinOpShuffle(I);
         break;
       case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 3d69f15fc5f249..e5880c93a9020f 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -417,12 +417,18 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -435,13 +441,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
 ;       but it is likely that extracting from index 3 is the better option.
 
 define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext_uses(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    call void @use_f32(float [[A23]])
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext_uses(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    call void @use_f32(float [[A23]])
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext_uses(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX-NEXT:    call void @use_f32(float [[A23]])
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -452,22 +466,37 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; CHECK-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; CHECK-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @PR34724(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @PR34724(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 52f7cd859a1ab1..49a636c1f804d0 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -417,12 +417,18 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -435,13 +441,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
 ;       but it is likely that extracting from index 3 is the better option.
 
 define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext_uses(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    call void @use_f32(float [[A23]])
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext_uses(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    call void @use_f32(float [[A23]])
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext_uses(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX-NEXT:    call void @use_f32(float [[A23]])
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -452,22 +466,37 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; CHECK-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; CHECK-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @PR34724(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @PR34724(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index c4aba63568e2ff..e99e21641531ab 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -537,19 +537,32 @@ define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16)
 
 declare ptr @getscaleptr()
 define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) nofree nosync {
-; CHECK-LABEL: @PR47558_multiple_use_load(
-; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
-; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
-; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; CHECK-NEXT:    ret void
+; SSE2-LABEL: @PR47558_multiple_use_load(
+; SSE2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; SSE2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; SSE2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; SSE2-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
+; SSE2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; SSE2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; SSE2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; SSE2-NEXT:    [[RESULT0:%.*]] =...
[truncated]

mshockwave · 2024-11-07T00:49:17Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  SmallVector<int> Mask(NumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);


please just use SmallVector (size_t Size, const T &Value)

It needs the iota for the ascending shuffle mask - it's not a splat.

I couldn't follow what you mean.
@mshockwave do you mean it is better to use SmallVector with an initial value, or that the std::iota call can be replaced?
@RKSimon what does splat mean?

A splat is when the same value is assigned to every element - but for this shuffle mask you need <0, 1, 2, 3, ..., N-1> which is what iota will give you.

I see! Thanks for letting me know.

my bad, I misunderstood what std::iota does

mshockwave · 2024-11-07T00:51:01Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  Mask[InsIdx] = ExtIdx + NumElts;
+  // Cost
+  ExtractElementInst *Ext;
+  if ((Ext = dyn_cast<ExtractElementInst>(I.getOperand(0))) == nullptr)


if (!isa<ExtractElementInst>(I.getOperand(0))) ?

mshockwave · 2024-11-07T00:52:46Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  if (!isSafeToSpeculativelyExecute(BO))
+    return false;
+
+  auto *VecTy = cast<FixedVectorType>(I.getType());


This will crash if `I` has a scalable vector type. You can limit this combine rule to fixed vectors only.

auto *VecTy = dyn_cast<FixedVectorType>(I.getType()); if (!VecTy || BO->getType() != VecTy) return false;

RKSimon

I don't understand why you require the BinOp for this - I'd be tempted to just make this about an insertelement(X, extractelement(Y, C1), C2) -> shuffle(X, Y) fold.

RKSimon · 2024-11-07T10:51:58Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

 }

+/// insert (DstVec, (extract (binop), ExtIdx), InsIdx) -->
+/// shuffl (DstVec, (binop), Mask)


RKSimon · 2024-11-07T10:52:49Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  if (!isSafeToSpeculativelyExecute(BO))
+    return false;
+
+  auto *VecTy = cast<FixedVectorType>(I.getType());


auto *VecTy = dyn_cast<FixedVectorType>(I.getType()); if (!VecTy || BO->getType() != VecTy) return false;

RKSimon · 2024-11-07T10:55:04Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  SmallVector<int> Mask(NumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);


It needs the iota for the ascending shuffle mask - it's not a splat.

github-actions · 2024-11-07T16:46:49Z

✅ With the latest revision this PR passed the C/C++ code formatter.

RKSimon · 2024-11-08T10:48:22Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+    return false;
+
+  unsigned NumElts = VecTy->getNumElements();
+  if (ExtIdx >= NumElts)


if (ExtIdx >= NumElts || InsIdx >= NumElts)

RKSimon · 2024-11-08T10:54:04Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  ExtractElementInst *Ext;
+  Ext = isa<ExtractElementInst>(I.getOperand(0))
+            ? cast<ExtractElementInst>(I.getOperand(0))
+            : cast<ExtractElementInst>(I.getOperand(1));


The match above has confirmed that I is an InsertElementInst - so this would be better as:

auto *Ins = cast<InsertElementInst>(&I); auto *Ext = cast<ExtractElementInst>(I.getOperand(1));

RKSimon · 2024-11-08T10:55:01Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);


OldCost should account for the Ins and Ext:

InstructionCost OldCost = TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) + TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);

RKSimon · 2024-11-08T10:56:59Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+  InstructionCost NewCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);


SK_Select can only be used when InsIdx == ExtIdx - this needs to be SK_PermuteTwoSrc (improveShuffleKindFromMask is called internally by getShuffleCost and will convert it to SK_Select if valid)

I'm not familiar with vectorcombine so I made a mistake, thanks for letting me know and I'll try harder.

RKSimon · 2024-11-08T10:59:02Z

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+  Value *DstVec, *SrcVec;
+  uint64_t ExtIdx, InsIdx;
+  if (!match(&I, m_InsertElt(m_Value(DstVec),
+                             m_OneUse(m_ExtractElt(m_Value(SrcVec),


We should be able to remove the m_OneUse if we can account for it in the NewCost below by adding the ExtractElementInst cost back if it has multiple uses.

RKSimon · 2024-11-11T11:15:52Z

@ParkHanbum Please can you merge with trunk again to see if we can get the CI to run green ?

insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) --> shuffle (DstVec, SrcVec, Mask) This commit combines extract/insert on a vector into Shuffle with vector.

RKSimon

LGTM - cheers!

RKSimon · 2024-11-12T13:15:54Z

@davemgreen any comments?

davemgreen · 2024-11-12T21:26:03Z

Hi - I think so long as it is cost-modelled it should be OK. Our shuffle generation is not going to be optimal at times (if it doesn't go through perfect shuffle tables), but the costs should be equally high as far as I understand.

llvmbot added vectorizers llvm:transforms labels Nov 6, 2024

mshockwave reviewed Nov 7, 2024

View reviewed changes

mshockwave requested a review from RKSimon November 7, 2024 00:59

RKSimon reviewed Nov 7, 2024

View reviewed changes

ParkHanbum force-pushed the vector_combine2 branch from 5afe48e to 290e9c5 Compare November 7, 2024 16:42

ParkHanbum changed the title ~~[VectorCombine] Combine BinOp with extract/insert to vector BinOp~~ [VectorCombine] Combine extract/insert from vector Nov 7, 2024

ParkHanbum force-pushed the vector_combine2 branch from 290e9c5 to 5ae995e Compare November 7, 2024 16:54

ParkHanbum requested review from RKSimon and mshockwave November 7, 2024 19:19

RKSimon reviewed Nov 8, 2024

View reviewed changes

ParkHanbum force-pushed the vector_combine2 branch from 5ae995e to 78bf832 Compare November 8, 2024 18:23

ParkHanbum requested a review from RKSimon November 9, 2024 04:42

ParkHanbum added 8 commits November 12, 2024 01:24

[VectorCombine] Combine extract/insert from vector

cca6bdb

insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) --> shuffle (DstVec, SrcVec, Mask) This commit combines extract/insert on a vector into Shuffle with vector.

use isa to get ExtractElts from current Instruction

dad00aa

use SmallVector with initial value

bacd8ae

add combining condition for InsIdx >= NumEls

04af5c4

Change the way to get ExtElts

191345e

Correcting incorrect cost calculation

681766d

Handling the case when ExtElt is not the OneUse

869a5d5

update testcases

362e704

ParkHanbum force-pushed the vector_combine2 branch from 78bf832 to 362e704 Compare November 11, 2024 16:25

RKSimon approved these changes Nov 12, 2024

View reviewed changes

RKSimon requested a review from davemgreen November 12, 2024 13:15

RKSimon merged commit d942f5e into llvm:main Nov 13, 2024
8 checks passed

ParkHanbum deleted the vector_combine2 branch December 20, 2024 17:56

		SmallVector<int> Mask(NumElts);
		std::iota(Mask.begin(), Mask.end(), 0);

[VectorCombine] Combine extract/insert from vector #115213

[VectorCombine] Combine extract/insert from vector #115213

Uh oh!

Conversation

ParkHanbum commented Nov 6, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Nov 6, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Nov 7, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon commented Nov 11, 2024

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon commented Nov 12, 2024

Uh oh!

davemgreen commented Nov 12, 2024

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

ParkHanbum commented Nov 6, 2024 •

edited

Loading

llvmbot commented Nov 6, 2024 •

edited

Loading

github-actions bot commented Nov 7, 2024 •

edited

Loading