Skip to content

Commit 4ef6a5e

Browse files
committed
[LV] Don't create partial reductions if factor doesn't match accumulator
Check if the scale factor of the accumulator is the same as the requested ScaleFactor in tryToCreatePartialReduction. This prevents creating partial reductions when not all instructions in the reduction chain form partial reductions, e.g. because we do not form a partial reduction for the loop-exit instruction. Code-gen currently works fine, because the scale factor of VPPartialReductionRecipe is not used during ::execute, but we compute incorrect cost/register pressure, because the partial reduction will not reduce to the specified scaling factor.
1 parent 2848e28 commit 4ef6a5e

File tree

5 files changed

+30
-19
lines changed

5 files changed

+30
-19
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8141,8 +8141,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
81418141
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
81428142
return tryToWidenMemory(Instr, Operands, Range);
81438143

8144-
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8145-
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
8144+
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
8145+
if (auto PartialRed =
8146+
tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
8147+
return PartialRed;
8148+
}
81468149

81478150
if (!shouldWiden(Instr, Range))
81488151
return nullptr;
@@ -8176,6 +8179,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
81768179
isa<VPPartialReductionRecipe>(BinOpRecipe))
81778180
std::swap(BinOp, Accumulator);
81788181

8182+
if (ScaleFactor !=
8183+
vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
8184+
return nullptr;
8185+
81798186
unsigned ReductionOpcode = Reduction->getOpcode();
81808187
if (ReductionOpcode == Instruction::Sub) {
81818188
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -394,20 +394,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
394394
return Base::properlyDominates(ParentA, ParentB);
395395
}
396396

397-
/// Get the VF scaling factor applied to the recipe's output, if the recipe has
398-
/// one.
399-
static unsigned getVFScaleFactor(VPRecipeBase *R) {
400-
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
401-
return RR->getVFScaleFactor();
402-
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
403-
return RR->getVFScaleFactor();
404-
assert(
405-
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
406-
VPInstruction::ReductionStartVector) &&
407-
"getting scaling factor of reduction-start-vector not implemented yet");
408-
return 1;
409-
}
410-
411397
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
412398
unsigned OverrideMaxNumRegs) const {
413399
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
@@ -569,7 +555,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
569555
} else {
570556
// The output from scaled phis and scaled reductions actually has
571557
// fewer lanes than the VF.
572-
unsigned ScaleFactor = getVFScaleFactor(R);
558+
unsigned ScaleFactor = vputils::getVFScaleFactor(R);
573559
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
574560
LLVM_DEBUG(if (VF != VFs[J]) {
575561
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,17 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
141141
});
142142
return I == DepthFirst.end() ? nullptr : cast<VPBasicBlock>(*I);
143143
}
144+
145+
unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
  // No recipe (e.g. a value with no defining recipe) means no scaling applies.
  if (!R)
    return 1;
  // Scaled reduction phis and partial reductions carry an explicit scale
  // factor that shrinks the number of output lanes relative to the VF.
  if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(R))
    return PhiR->getVFScaleFactor();
  if (auto *PartialR = dyn_cast<VPPartialReductionRecipe>(R))
    return PartialR->getVFScaleFactor();
  assert((!isa<VPInstruction>(R) ||
          cast<VPInstruction>(R)->getOpcode() !=
              VPInstruction::ReductionStartVector) &&
         "getting scaling factor of reduction-start-vector not implemented yet");
  // Every other recipe produces one output lane per VF lane.
  return 1;
}

llvm/lib/Transforms/Vectorize/VPlanUtils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
101101
/// Returns the header block of the first, top-level loop, or null if none
102102
/// exist.
103103
VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
104+
105+
/// Get the VF scaling factor applied to the recipe's output, if the recipe has
106+
/// one.
107+
unsigned getVFScaleFactor(VPRecipeBase *R);
104108
} // namespace vputils
105109

106110
//===----------------------------------------------------------------------===//

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
14041404
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
14051405
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
14061406
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
1407-
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
1407+
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
14081408
; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
14091409
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
14101410
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) {
14781478
; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
14791479
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
14801480
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
1481-
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32(<vscale x 8 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP7]])
1481+
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
14821482
; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
14831483
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
14841484
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

0 commit comments

Comments
 (0)