[SLP] Support for copyables in the reduced values #153589

alexey-bataev · 2025-08-14T14:52:48Z

Currently reductions can handles only same/alternate instructions,
skipping potential support for copyables. Patch adds support for
copyables in the reduced values.

Created using spr 1.3.5

llvmbot · 2025-08-14T14:53:19Z

@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Currently reductions can handles only same/alternate instructions,
skipping potential support for copyables. Patch adds support for
copyables in the reduced values.

Full diff: https://github.com/llvm/llvm-project/pull/153589.diff

7 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+47-6)
(modified) llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll (+20-16)
(modified) llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll (+1-1)
(modified) llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll (+2-2)
(modified) llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll (+1-1)
(modified) llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll (+5-4)
(modified) llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll (+35-21)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a3cb4d138789c..3e64917bd7739 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23783,7 +23783,8 @@ class HorizontalReduction {
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                     DominatorTree &DT) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -23900,8 +23901,44 @@ class HorizontalReduction {
     bool CheckForReusedReductionOps = false;
     // Try to vectorize elements based on their type.
     SmallVector<InstructionsState> States;
-    for (ArrayRef<Value *> RV : ReducedVals)
+    SmallVector<SmallVector<Value*>> LocalReducedVals;
+    for (ArrayRef<Value *> RV : ReducedVals) {
+      // Loads are not very compatible with undefs.
+      if (isa<UndefValue>(RV.front()) &&
+          (States.empty() || !States.back() ||
+           States.back().getOpcode() == Instruction::Load)) {
+        LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+        States.push_back(InstructionsState::invalid());
+        continue;
+      }
+      if (!LocalReducedVals.empty() &&
+          isa<UndefValue>(LocalReducedVals.back().front()) &&
+          isa<LoadInst>(RV.front())) {
+        LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+        States.push_back(getSameOpcode(RV, TLI));
+        continue;
+      }
+      SmallVector<Value *> Ops;
+      if (!LocalReducedVals.empty())
+        Ops = LocalReducedVals.back();
+      Ops.append(RV.begin(), RV.end());
+      InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
+      InstructionsState OpS =
+          Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
+      if (LocalReducedVals.empty()) {
+        LocalReducedVals.push_back(Ops);
+        States.push_back(OpS);
+        continue;
+      }
+      if (OpS) {
+        LocalReducedVals.back().swap(Ops);
+        States.back() = OpS;
+        continue;
+      }
+      LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
       States.push_back(getSameOpcode(RV, TLI));
+    }
+    ReducedVals.swap(LocalReducedVals);
     for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
       ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
       InstructionsState S = States[I];
@@ -23916,8 +23953,10 @@ class HorizontalReduction {
         // Also check if the instruction was folded to constant/other value.
         auto *Inst = dyn_cast<Instruction>(RdxVal);
         if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
-             (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
-            (S && !Inst))
+             (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
+                     !S.isCopyableElement(Inst)))) ||
+            (S && !Inst && !isa<PoisonValue>(RdxVal) &&
+             !S.isCopyableElement(RdxVal)))
           continue;
         Candidates.push_back(RdxVal);
         TrackedToOrig.try_emplace(RdxVal, ReducedVal);
@@ -24503,6 +24542,8 @@ class HorizontalReduction {
       // Scalar cost is repeated for N-1 elements.
       int Cnt = ReducedVals.size();
       for (Value *RdxVal : ReducedVals) {
+        if (!isa<Instruction>(RdxVal))
+          continue;
         if (Cnt == 1)
           break;
         --Cnt;
@@ -25247,7 +25288,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25392,7 +25433,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
       return false;
 
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 143e09374a891..e3279290c3074 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,34 +5,38 @@
 define void @Test(i32) {
 ; CHECK-LABEL: @Test(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
+; CHECK-NEXT:    [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
-; CHECK-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
+; CHECK-NEXT:    [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; CHECK-NEXT:    [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 ; FORCE_REDUCTION-LABEL: @Test(
 ; FORCE_REDUCTION-NEXT:  entry:
+; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP:%.*]]
 ; FORCE_REDUCTION:       loop:
 ; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT:    [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
+; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
+; FORCE_REDUCTION-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
+; FORCE_REDUCTION-NEXT:    [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
 ; FORCE_REDUCTION-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
-; FORCE_REDUCTION-NEXT:    [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; FORCE_REDUCTION-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; FORCE_REDUCTION-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
+; FORCE_REDUCTION-NEXT:    [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; FORCE_REDUCTION-NEXT:    [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
 ; FORCE_REDUCTION-NEXT:    br label [[LOOP]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
index 79c6c6b3f046f..d7c63457bf5c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
@@ -8,11 +8,11 @@ define i8 @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 0 to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 0 to i8
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 0 to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i8 [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]]
 ; CHECK-NEXT:    [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]]
 ; CHECK-NEXT:    [[OP_RDX5:%.*]] = or i8 [[OP_RDX3]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
 ; CHECK-NEXT:    [[OP_RDX4:%.*]] = or i8 [[OP_RDX5]], [[TMP4]]
 ; CHECK-NEXT:    ret i8 [[OP_RDX4]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
index 1cf837df719ec..56919ae0ffc90 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
@@ -10,9 +10,9 @@ define i32 @foo() {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]])
+; CHECK-NEXT:    [[OP_RDX7:%.*]] = mul i32 0, [[TMP2]]
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = mul i32 0, [[TMP5]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[OP_RDX6:%.*]] = mul i32 [[OP_RDX7]], [[OP_RDX]]
 ; CHECK-NEXT:    ret i32 [[OP_RDX6]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
index 6d6dd502415e5..dd3dee6910a4c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
@@ -6,7 +6,7 @@ define void @test() {
 ; CHECK-NEXT:    br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
 ; CHECK:       ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 0, [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 [[TMP0]], 0
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index f7811aba5ab5f..2774d5f3b64e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -7,16 +7,17 @@ define <4 x i16> @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
 ; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[RDX_OP1:%.*]] = or <16 x i16> [[RDX_OP]], zeroinitializer
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
 ; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
 ; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
-; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
 ; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
-; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
 ; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
 ; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
index 62417268bf3d0..7ed1edc278806 100644
--- a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
+++ b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
@@ -1,27 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define i64 @src(i32 %a) {
-; CHECK-LABEL: define i64 @src(
-; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
-; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
-; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i64> <i64 poison, i64 4294967297>, i64 [[TMP17]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    ret i64 [[TMP21]]
+; X86-LABEL: define i64 @src(
+; X86-SAME: i32 [[A:%.*]]) {
+; X86-NEXT:  [[ENTRY:.*:]]
+; X86-NEXT:    [[TMP0:%.*]] = sext i32 [[A]] to i64
+; X86-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; X86-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; X86-NEXT:    [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
+; X86-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
+; X86-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; X86-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
+; X86-NEXT:    [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
+; X86-NEXT:    [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
+; X86-NEXT:    ret i64 [[OP_RDX1]]
+;
+; AARCH64-LABEL: define i64 @src(
+; AARCH64-SAME: i32 [[A:%.*]]) {
+; AARCH64-NEXT:  [[ENTRY:.*:]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = sext i32 [[A]] to i64
+; AARCH64-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; AARCH64-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AARCH64-NEXT:    [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AARCH64-NEXT:    [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
+; AARCH64-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
+; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AARCH64-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
+; AARCH64-NEXT:    [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
+; AARCH64-NEXT:    [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
+; AARCH64-NEXT:    ret i64 [[OP_RDX1]]
 ;
 entry:
   %0 = sext i32 %a to i64

github-actions · 2025-08-14T14:55:43Z

✅ With the latest revision this PR passed the C/C++ code formatter.

Created using spr 1.3.5

alexey-bataev · 2025-09-12T10:59:50Z

Ping!

RKSimon

please can you update against trunk?

Created using spr 1.3.7

RKSimon · 2025-10-17T16:40:43Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

    SmallVector<InstructionsState> States;
-    for (ArrayRef<Value *> RV : ReducedVals)
+    SmallVector<SmallVector<Value *>> LocalReducedVals;
+    for (ArrayRef<Value *> RV : ReducedVals) {


Created using spr 1.3.7

[𝘀𝗽𝗿] initial version

4eb3b66

Created using spr 1.3.5

llvmbot added vectorizers llvm:transforms labels Aug 14, 2025

alexey-bataev requested review from RKSimon and hiraditya August 14, 2025 14:53

Fix formatting

ccd9973

Created using spr 1.3.5

RKSimon requested changes Oct 14, 2025

View reviewed changes

Rebase

132070b

Created using spr 1.3.7

llvmbot added the backend:RISC-V label Oct 17, 2025

RKSimon reviewed Oct 17, 2025

View reviewed changes

Added a comment

b212f20

Created using spr 1.3.7

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SLP] Support for copyables in the reduced values #153589

[SLP] Support for copyables in the reduced values #153589

Uh oh!

alexey-bataev commented Aug 14, 2025

Uh oh!

llvmbot commented Aug 14, 2025 •

edited

Loading

Uh oh!

github-actions bot commented Aug 14, 2025 •

edited

Loading

Uh oh!

alexey-bataev commented Sep 12, 2025

Uh oh!

RKSimon left a comment

Uh oh!

RKSimon Oct 17, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[SLP] Support for copyables in the reduced values #153589

Are you sure you want to change the base?

[SLP] Support for copyables in the reduced values #153589

Uh oh!

Conversation

alexey-bataev commented Aug 14, 2025

Uh oh!

llvmbot commented Aug 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented Aug 14, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

alexey-bataev commented Sep 12, 2025

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 17, 2025

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

llvmbot commented Aug 14, 2025 •

edited

Loading

github-actions bot commented Aug 14, 2025 •

edited

Loading