Skip to content

Commit 9a9164f

Browse files
committed
[LV] Reduce register usage for scaled reductions
1 parent 57ffaea commit 9a9164f

File tree

5 files changed

+282
-221
lines changed

5 files changed

+282
-221
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5019,10 +5019,23 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50195019
// even in the scalar case.
50205020
RegUsage[ClassID] += 1;
50215021
} else {
5022+
// The output from scaled phis and scaled reductions actually have
5023+
// fewer lanes than the VF.
5024+
auto VF = VFs[J];
5025+
if (auto *ReductionR = dyn_cast<VPReductionPHIRecipe>(R))
5026+
VF = VF.divideCoefficientBy(ReductionR->getVFScaleFactor());
5027+
else if (auto *PartialReductionR =
5028+
dyn_cast<VPPartialReductionRecipe>(R))
5029+
VF = VF.divideCoefficientBy(PartialReductionR->getScaleFactor());
5030+
if (VF != VFs[J])
5031+
LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J]
5032+
<< " to " << VF << " for ";
5033+
R->dump(););
5034+
50225035
for (VPValue *DefV : R->definedValues()) {
50235036
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
50245037
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5025-
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
5038+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
50265039
}
50275040
}
50285041
}
@@ -8951,8 +8964,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
89518964
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
89528965
return tryToWidenMemory(Instr, Operands, Range);
89538966

8954-
if (getScalingForReduction(Instr))
8955-
return tryToCreatePartialReduction(Instr, Operands);
8967+
if (auto ScaleFactor = getScalingForReduction(Instr))
8968+
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
89568969

89578970
if (!shouldWiden(Instr, Range))
89588971
return nullptr;
@@ -8976,7 +8989,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
89768989

89778990
VPRecipeBase *
89788991
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8979-
ArrayRef<VPValue *> Operands) {
8992+
ArrayRef<VPValue *> Operands,
8993+
unsigned ScaleFactor) {
89808994
assert(Operands.size() == 2 &&
89818995
"Unexpected number of operands for partial reduction");
89828996

@@ -9009,7 +9023,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
90099023
BinOp = Builder.createSelect(Mask, BinOp, Zero, Reduction->getDebugLoc());
90109024
}
90119025
return new VPPartialReductionRecipe(ReductionOpcode, BinOp, Accumulator,
9012-
Reduction);
9026+
ScaleFactor, Reduction);
90139027
}
90149028

90159029
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ class VPRecipeBuilder {
178178
/// Create and return a partial reduction recipe for a reduction instruction
179179
/// along with binary operation and reduction phi operands.
180180
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
181-
ArrayRef<VPValue *> Operands);
181+
ArrayRef<VPValue *> Operands,
182+
unsigned ScaleFactor);
182183

183184
/// Set the recipe created for given ingredient.
184185
void setRecipe(Instruction *I, VPRecipeBase *R) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2033,6 +2033,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20332033
/// Generate the phi/select nodes.
20342034
void execute(VPTransformState &State) override;
20352035

2036+
unsigned getVFScaleFactor() const { return VFScaleFactor; }
2037+
20362038
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
20372039
/// Print the recipe.
20382040
void print(raw_ostream &O, const Twine &Indent,
@@ -2063,17 +2065,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
20632065
/// scalar value.
20642066
class VPPartialReductionRecipe : public VPSingleDefRecipe {
20652067
unsigned Opcode;
2068+
unsigned ScaleFactor;
20662069

20672070
public:
20682071
VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
2069-
VPValue *Op1)
2072+
VPValue *Op1, unsigned ScaleFactor)
20702073
: VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
2071-
ReductionInst) {}
2074+
ScaleFactor, ReductionInst) {}
20722075
VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
2076+
unsigned ScaleFactor,
20732077
Instruction *ReductionInst = nullptr)
20742078
: VPSingleDefRecipe(VPDef::VPPartialReductionSC,
20752079
ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
2076-
Opcode(Opcode) {
2080+
Opcode(Opcode), ScaleFactor(ScaleFactor) {
20772081
[[maybe_unused]] auto *AccumulatorRecipe =
20782082
getOperand(1)->getDefiningRecipe();
20792083
assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
@@ -2084,7 +2088,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
20842088

20852089
VPPartialReductionRecipe *clone() override {
20862090
return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
2087-
getUnderlyingInstr());
2091+
ScaleFactor, getUnderlyingInstr());
20882092
}
20892093

20902094
VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
@@ -2099,6 +2103,8 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe {
20992103
/// Get the binary op's opcode.
21002104
unsigned getOpcode() const { return Opcode; }
21012105

2106+
unsigned getScaleFactor() const { return ScaleFactor; }
2107+
21022108
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
21032109
/// Print the recipe.
21042110
void print(raw_ostream &O, const Twine &Indent,

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll

Lines changed: 79 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -770,10 +770,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
770770
; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled(
771771
; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
772772
; CHECK-INTERLEAVED-NEXT: entry:
773-
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16
773+
; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32
774774
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
775775
; CHECK-INTERLEAVED: vector.ph:
776-
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16
776+
; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32
777777
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]]
778778
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
779779
; CHECK-INTERLEAVED: vector.body:
@@ -782,6 +782,10 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
782782
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ]
783783
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ]
784784
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
785+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ]
786+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ]
787+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE1:%.*]], [[VECTOR_BODY]] ]
788+
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ]
785789
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
786790
; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
787791
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1
@@ -794,45 +798,81 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
794798
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
795799
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
796800
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
801+
; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
797802
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1
798-
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
799-
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
800-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
801-
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
802-
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP36]]
803-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP15]])
804-
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
805-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
806-
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
807-
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
808-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
809-
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
810-
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP19]]
811-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP20]])
812-
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
813-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
814-
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
815-
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
816-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP23]], align 1
817-
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
818-
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul nsw <16 x i32> [[TMP22]], [[TMP24]]
819-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP25]])
820-
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
821-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
822-
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32>
823-
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
824-
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
825-
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
826-
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP27]], [[TMP29]]
827-
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP30]])
828-
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
829-
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
830-
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
803+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP38]], align 1
804+
; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
805+
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
806+
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
807+
; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16
808+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
809+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
810+
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
811+
; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
812+
; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP13]]
813+
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <16 x i32> [[TMP18]], [[TMP14]]
814+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE1]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP19]])
815+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP20]])
816+
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
817+
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 16
818+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP21]], align 1
819+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP22]], align 1
820+
; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32>
821+
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32>
822+
; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
823+
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16
824+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP25]], align 1
825+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP26]], align 1
826+
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32>
827+
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32>
828+
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP27]]
829+
; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP28]]
830+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP29]])
831+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP30]])
832+
; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
833+
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 16
834+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP31]], align 1
835+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
836+
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32>
837+
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32>
838+
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
839+
; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16
840+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1
841+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1
842+
; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32>
843+
; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32>
844+
; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP33]], [[TMP37]]
845+
; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = mul nsw <16 x i32> [[TMP34]], [[TMP56]]
846+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP39]])
847+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP40]])
848+
; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
849+
; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 16
850+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP41]], align 1
851+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP42]], align 1
852+
; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32>
853+
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32>
854+
; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
855+
; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16
856+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP45]], align 1
857+
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP46]], align 1
858+
; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32>
859+
; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32>
860+
; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = mul nsw <16 x i32> [[TMP43]], [[TMP47]]
861+
; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <16 x i32> [[TMP44]], [[TMP48]]
862+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP49]])
863+
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]])
864+
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
865+
; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
866+
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
831867
; CHECK-INTERLEAVED: middle.block:
832-
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
833-
; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
834-
; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]])
835-
; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
868+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]]
869+
; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
870+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE]], [[PARTIAL_REDUCE7]]
871+
; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]])
872+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]]
873+
; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]])
874+
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE1]]
875+
; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]])
836876
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
837877
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
838878
; CHECK-INTERLEAVED: scalar.ph:

0 commit comments

Comments
 (0)