14 changes: 12 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();

// A ratio of 1 would mean it's similar to a regular add, e.g.
// v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
// <=> add v4i64 %acc, %vec
if (Ratio == 1) {
Contributor:
Rather than checking for Ratio == 1, you should replace either:

  1. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp#L323 with getArithmeticInstrCost (I believe this line is simply checking for the simple add reduction)

  OR

  2. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp#L3552 with getArithmeticInstrCost
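
For illustration, a rough sketch of what that alternative might look like on the VPlan side. The code at the linked lines is not part of this diff, so the names used here (ScaleFactor, PhiTy, VF, Ctx) and the exact call site are assumptions, not the actual implementation:

  // Hypothetical sketch only: when the accumulator and input element types
  // match (scale factor 1), cost the recipe as a plain vector add via the
  // generic TTI hook instead of the partial-reduction hook.
  if (ScaleFactor == 1) {
    auto *VecTy = VectorType::get(PhiTy, VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VecTy,
                                          Ctx.CostKind);
  }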

auto *T = VectorType::get(AccumType, VF);
return getArithmeticInstrCost(Opcode, T, CostKind) +
(BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
}

if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
(!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
@@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (IsUSDot && !ST->hasMatMulInt8())
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;

2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }

void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
67 changes: 63 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,6 +40,8 @@
using namespace llvm;
using namespace VPlanPatternMatch;

#define DEBUG_TYPE "loop-vectorize"

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));
@@ -3761,7 +3763,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,

/// This function tries to create abstract recipes from the reduction recipe for
/// following optimizations and cost estimation.
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
static bool tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
VPExpressionRecipe *AbstractR = nullptr;
@@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
return;
return false;

AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
return true;
Contributor:
Need to add Red->eraseFromParent(); before this?
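
If the erase were added as asked, the tail of the helper would read roughly like this (a sketch only; whether the erase is actually needed depends on whether the expression recipe already detaches the recipes it bundles):

  AbstractR->insertBefore(*VPBB, IP);
  Red->replaceAllUsesWith(AbstractR);
  // Sketch of the suggestion: drop the now-unused reduction recipe once all
  // of its users have been rewired to the abstract recipe.
  Red->eraseFromParent();
  return true;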

}

/// Lower a partial reduction back to a regular reduction, by
/// changing the in-loop partial reduction to a binop and removing
/// the scale factor from the PHI node.
static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
VPCostContext &Ctx) {
VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
PhiR->setVFScaleFactor(1);

// We also need to update the scale factor of the reduction-start-vector
// operand.
VPValue *StartV, *IdentityV;
if (!match(PhiR->getOperand(0),
m_VPInstruction<VPInstruction::ReductionStartVector>(
m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
llvm_unreachable("Unexpected operand for a partial reduction");
Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
}

if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
if (R->getVFScaleFactor() != 1)
lowerPartialReduction(Plan, R, Ctx);

LLVM_DEBUG(
dbgs() << "LV: Lowering " << *Red
<< " back to regular reduction, because it is not profitable\n");
Collaborator:
I think we want an "a" before "regular".


// Lower the partial reduction to a regular binop.
VPBuilder Builder(Red);
VPInstruction *Add = Builder.createNaryOp(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
{Red->getChainOp(), Red->getVecOp()});
if (Red->isConditional())
Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());

Red->replaceAllUsesWith(Add);
Red->eraseFromParent();
}

void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
auto *Red = dyn_cast<VPReductionRecipe>(&R);
if (!Red)
continue;

if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
Contributor:
If 'Red' is converted to an AbstractRecipe, Red should be null. Can you check for Red instead of returning bool from tryToCreateAbstractReductionRecipe?
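
One way to read this suggestion (an interpretation, not something stated in the patch) is to have the helper return the recipe it created, or nullptr on failure, and key the caller off that instead of a bool:

  // Hypothetical signature change: return the created expression recipe,
  // or nullptr when no profitable abstract recipe could be formed.
  static VPExpressionRecipe *
  tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range);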

isa<VPPartialReductionRecipe>(Red)) {
// If there isn't a profitable VPExpression for a partial reduction,
// then that suggests using a partial reduction is not profitable
// for this VPlan. It seems better to resort to a regular (middle-block)
// reduction, so that the this plan is still profitable to consider.
Collaborator:
Extra "the" here.

// Otherwise, the plan might be discarded in favour of a smaller VF.
//
// FIXME: There's a lot to unpick when it comes to partial
// reductions, but this should provide a temporary stop-gap until we
// reimplement the logic for creating partial reductions.
lowerPartialReduction(Plan, cast<VPPartialReductionRecipe>(Red), Ctx);
Collaborator:
I think we should only call this if the reduction actually is partial, otherwise we'll waste some time essentially doing nothing in the lower function.
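
A sketch of one way to add such a guard, using the getVFScaleFactor() accessor that the patch already relies on in the recursive case (the exact placement is an assumption):

  // Sketch: only lower when the reduction really is partial (scale factor > 1),
  // so lowerPartialReduction is not invoked just to rebuild an equivalent add.
  auto *PRed = dyn_cast<VPPartialReductionRecipe>(Red);
  if (PRed && PRed->getVFScaleFactor() != 1)
    lowerPartialReduction(Plan, PRed, Ctx);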

Contributor:
It would be great if we can lower this here, basically after this

}
}
}
}
@@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
;
entry:
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s | FileCheck %s
target triple = "aarch64"

; Check that a partial reduction is reverted back to a regular reduction,
Collaborator:
Could you add some check statements that make sure that the reversion to a normal reduction happened? I think we'll also want a test with a subtract, as well as chained add and chained subtract.

; so that we compare "the VPlan with the best kind of reduction for <range>"
; vs "the VPlan with the best kind of reduction for <other range>",

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable vscale_range(1,16)
define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
Collaborator:
Could you rename this test to something like revert_add and remove local_unnamed_addr #0?

; CHECK-LABEL: define dso_local i64 @foo(
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
; CHECK: [[ITER_CHECK]]:
; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 12
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
; CHECK-NEXT: [[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
; CHECK-NEXT: [[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
; CHECK-NEXT: [[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
; CHECK-NEXT: [[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
Contributor:
You don't need to check anything after the middle block. You can use something like '--filter-out-after' when generating the test.

; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX9]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB25:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF10]]
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i64> [ [[TMP19]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX12]]
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = sext <4 x i32> [[WIDE_LOAD14]] to <4 x i64>
; CHECK-NEXT: [[TMP22]] = add <4 x i64> [[VEC_PHI13]], [[TMP21]]
; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP22]])
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[BB25]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[BB29:.*]]
; CHECK: [[BB25]]:
; CHECK-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP35:%.*]], %[[BB29]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[BB27]]
; CHECK: [[BB27]]:
; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP26]], %[[BB25]] ]
; CHECK-NEXT: ret i64 [[TMP28]]
; CHECK: [[BB29]]:
; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP36:%.*]], %[[BB29]] ]
; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ [[BC_MERGE_RDX17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP35]], %[[BB29]] ]
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[TMP30]]
; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64
; CHECK-NEXT: [[TMP35]] = add i64 [[TMP31]], [[TMP34]]
; CHECK-NEXT: [[TMP36]] = add nuw nsw i64 [[TMP30]], 1
; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP36]], [[TMP4]]
; CHECK-NEXT: br i1 [[TMP37]], label %[[BB25]], label %[[BB29]], !llvm.loop [[LOOP5:![0-9]+]]
;
%3 = icmp sgt i32 %1, 0
Contributor:
Could you rename some of the labels and variables to have more meaningful names?

Contributor:
I think you can simplify the CFG here a little by removing the check for sgt 0.

br i1 %3, label %4, label %8

4: ; preds = %2
%5 = zext nneg i32 %1 to i64
Contributor:
Could you just pass in an i64 as %1 instead? That way you can remove the zext. I don't think the test really relies upon it.

br label %10

6: ; preds = %10
%7 = phi i64 [ %16, %10 ]
br label %8

8: ; preds = %6, %2
%9 = phi i64 [ 0, %2 ], [ %7, %6 ]
ret i64 %9

10: ; preds = %4, %10
%11 = phi i64 [ 0, %4 ], [ %17, %10 ]
%12 = phi i64 [ 0, %4 ], [ %16, %10 ]
%13 = getelementptr inbounds nuw i32, ptr %0, i64 %11
%14 = load i32, ptr %13, align 4
%15 = sext i32 %14 to i64
%16 = add i64 %12, %15
%17 = add nuw nsw i64 %11, 1
%18 = icmp eq i64 %17, %5
br i1 %18, label %6, label %10
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
Comment on lines +130 to +136
Collaborator:
You can add --check-globals none to the update script to avoid these being added.
