Merged
Changes from 4 commits
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -254,6 +254,13 @@ class VPBuilder {
new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
GEPNoWrapFlags::inBounds(), DL, Name));
}
VPInstruction *createWidePtrAdd(VPValue *Ptr, VPValue *Offset,
DebugLoc DL = DebugLoc::getUnknown(),
const Twine &Name = "") {
return tryInsertInstruction(
new VPInstruction(VPInstruction::WidePtrAdd, {Ptr, Offset},
GEPNoWrapFlags::none(), DL, Name));
}

VPPhi *createScalarPhi(ArrayRef<VPValue *> IncomingValues, DebugLoc DL,
const Twine &Name = "") {
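For readers skimming the header change: a minimal usage sketch of the new helper next to the existing one. The names Builder, Base, ScalarOff and VecOff are hypothetical; the actual call site added by this PR is in expandVPWidenPointerInduction in VPlanTransforms.cpp below.

// Sketch only. Both helpers insert a two-operand VPInstruction at the
// builder's insert point; they differ in opcode and in the GEP wrap flags
// attached (inbounds for PtrAdd, none for WidePtrAdd).
VPValue *LanePtr = Builder.createPtrAdd(Base, ScalarOff, DL, "ptr.ind");
// WidePtrAdd takes a per-lane vector of byte offsets (VecOff) and, when
// executed, yields a vector of pointers off the single scalar base.
VPValue *VecPtrs = Builder.createWidePtrAdd(Base, VecOff, DL, "vector.gep");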
15 changes: 0 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1043,21 +1043,6 @@ void VPlan::execute(VPTransformState *State) {
if (isa<VPWidenPHIRecipe>(&R))
continue;

if (auto *WidenPhi = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
"recipe generating only scalars should have been replaced");
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
PHINode *Phi = cast<PHINode>(GEP->getPointerOperand());

Phi->setIncomingBlock(1, VectorLatchBB);

// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
continue;
}

auto *PhiR = cast<VPSingleDefRecipe>(&R);
// VPInstructions currently model scalar Phis only.
bool NeedsScalar = isa<VPInstruction>(PhiR) ||
19 changes: 8 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -961,6 +961,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
PtrAdd,
// Add a vector offset in bytes (second operand) to a scalar base pointer
// (first operand).
WidePtrAdd,
// Returns a scalar boolean value, which is true if any lane of its
// (boolean) vector operands is true. It produces the reduced value across
// all unrolled iterations. Unrolling will add all copies of its original
@@ -2064,8 +2067,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
}
};

class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
public VPUnrollPartAccessor<4> {
class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe {
bool IsScalarAfterVectorization;

public:
@@ -2093,19 +2095,14 @@ class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,

VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)

/// Generate vector values for the pointer induction.
void execute(VPTransformState &State) override;
void execute(VPTransformState &State) override {
llvm_unreachable("cannot execute this recipe, should be expanded via "
"expandVPWidenIntOrFpInductionRecipe");
};

/// Returns true if only scalar values will be generated.
bool onlyScalarsGenerated(bool IsScalable);

/// Returns the VPValue representing the value of this induction at
/// the first unrolled part, if it exists. Returns itself if unrolling did not
/// take place.
VPValue *getFirstUnrolledPartOperand() {
return getUnrollPart(*this) == 0 ? this : getOperand(3);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -126,6 +126,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return IntegerType::get(Ctx, 1);
case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
// Return the type based on first operand.
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
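A small sketch of what this one-line hunk implies, assuming an existing VPTypeAnalysis instance (the helper name checkWidePtrAddType is hypothetical): the inferred scalar type of a wide-ptradd follows its base-pointer operand, not the integer offsets, even though execution produces a vector of pointers.

// Hypothetical sanity check mirroring the rule added above.
static void checkWidePtrAddType(const VPInstruction *I, VPTypeAnalysis &TypeInfo) {
  assert(I->getOpcode() == VPInstruction::WidePtrAdd && "expected wide-ptradd");
  Type *Ty = TypeInfo.inferScalarType(I);
  // Same scalar type as the base pointer (operand 0), and a pointer type.
  assert(Ty == TypeInfo.inferScalarType(I->getOperand(0)) && Ty->isPointerTy());
  (void)Ty;
}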
101 changes: 14 additions & 87 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -475,6 +475,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
@@ -513,6 +514,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::AnyOf:
return true;
@@ -854,6 +856,11 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Addend = State.get(getOperand(1), VPLane(0));
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::WidePtrAdd: {
Value *Ptr = State.get(getOperand(0), true);
Value *Addend = State.get(getOperand(1), vputils::onlyFirstLaneUsed(this));
Contributor: Do we need to check for firstLaneUsed here? Ideally WidePtrAdd wouldn't be used if only a single lane is needed.

Contributor Author (@lukel97, Jul 14, 2025): Oddly enough, vputils::onlyFirstLaneUsed triggers on two test cases: Transforms/LoopVectorize/AArch64/sve-widen-gep.ll and Transforms/LoopVectorize/X86/pr48340.ll.

In at least pr48340 it comes from an unrolled pointer induction where the second unrolled gep isn't used?

vector.body:
  EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
  EMIT-SCALAR vp<%pointer.phi> = phi [ ir<%p>, ir-bb<vector.ph> ], [ vp<%ptr.ind>, vector.body ]
  EMIT vp<%2> = mul ir<4>, ir<3>
  EMIT vp<%3> = broadcast vp<%2>
  EMIT vp<%4> = step-vector i64
  EMIT vp<%5> = add vp<%3>, vp<%4>
  EMIT vp<%6> = mul vp<%5>, ir<1024>
  EMIT vp<%vector.gep> = wide-ptradd vp<%pointer.phi>, vp<%6>
  EMIT vp<%7> = mul ir<1024>, ir<16>
  EMIT vp<%8> = mul ir<4>, ir<0>
  EMIT vp<%9> = broadcast vp<%8>
  EMIT vp<%10> = step-vector i64
  EMIT vp<%11> = add vp<%9>, vp<%10>
  EMIT vp<%12> = mul vp<%11>, ir<1024>
  EMIT vp<%vector.gep>.1 = wide-ptradd vp<%pointer.phi>, vp<%12>
  WIDEN ir<%v> = load vp<%vector.gep>
  EMIT vp<%index.next> = add nuw vp<%index>, ir<16>
  EMIT vp<%ptr.ind> = ptradd vp<%pointer.phi>, vp<%7>
  EMIT branch-on-count vp<%index.next>, ir<%n.vec>
Successor(s): middle.block, vector.body

middle.block:
  EMIT vp<%14> = extract-last-element ir<%v>
  EMIT vp<%cmp.n> = icmp eq ir<%3>, ir<%n.vec>
  EMIT branch-on-cond vp<%cmp.n>
Successor(s): ir-bb<exit>, ir-bb<scalar.ph>

ir-bb<exit>:
  IR   %v.lcssa = phi ptr [ %v, %loop ] (extra operand: vp<%14> from middle.block)
No successors

So onlyFirstLaneUsed returns true, and we need to continue to generate a scalar for it to avoid a (mild) regression:

-; CHECK-NEXT:    [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], i64 0
+; CHECK-NEXT:    [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>

Contributor Author: Oh, I was looking at the wrong diff, sorry; it turns out the existing tests also generated a vector. I will remove the onlyFirstLaneUsed check.

Contributor Author: Done in 0315ec1.

return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
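Note: per the thread above, the onlyFirstLaneUsed check on the addend is dropped in follow-up commit 0315ec1, which is not part of the four commits shown here. Presumably the case then reads roughly like the sketch below; the quoted IR line comes from the pr48340 test diff above.

case VPInstruction::WidePtrAdd: {
  // Base stays scalar; the addend is a per-lane vector of byte offsets, so
  // CreatePtrAdd emits a gep with a vector index and the result is a vector
  // of lane pointers, e.g.:
  //   %vector.gep = getelementptr i8, ptr %pointer.phi,
  //                 <4 x i64> <i64 0, i64 1024, i64 2048, i64 3072>
  Value *Ptr = State.get(getOperand(0), /*IsScalar=*/true);
  Value *Addend = State.get(getOperand(1));
  return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}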
case VPInstruction::AnyOf: {
Value *Res = State.get(getOperand(0));
for (VPValue *Op : drop_begin(operands()))
@@ -1045,6 +1052,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
case VPInstruction::WideIVStep:
Contributor: Suggested change, for lex order (even though it's not consistent throughout):
-    case VPInstruction::WidePtrAdd:
-    case VPInstruction::WideIVStep:
+    case VPInstruction::WideIVStep:
+    case VPInstruction::WidePtrAdd:

case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
@@ -1082,6 +1090,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::ReductionStartVector:
return true;
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
@@ -1185,6 +1194,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::PtrAdd:
O << "ptradd";
break;
case VPInstruction::WidePtrAdd:
O << "wide-ptradd";
break;
case VPInstruction::AnyOf:
O << "any-of";
break;
@@ -1769,7 +1781,8 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
return Opcode == Instruction::AShr;
case OperationType::GEPOp:
return Opcode == Instruction::GetElementPtr ||
Opcode == VPInstruction::PtrAdd;
Opcode == VPInstruction::PtrAdd ||
Opcode == VPInstruction::WidePtrAdd;
case OperationType::FPMathOp:
return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
@@ -3690,87 +3703,6 @@ bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
(!IsScalable || vputils::onlyFirstLaneUsed(this));
}

void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(getInductionDescriptor().getKind() ==
InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
"Unexpected type.");
assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
"Recipe should have been replaced");

unsigned CurrentPart = getUnrollPart(*this);

// Build a pointer phi
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();

BasicBlock *VectorPH =
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
PHINode *NewPointerPhi = nullptr;
if (CurrentPart == 0) {
IRBuilder<>::InsertPointGuard Guard(State.Builder);
if (State.Builder.GetInsertPoint() !=
State.Builder.GetInsertBlock()->getFirstNonPHIIt())
State.Builder.SetInsertPoint(
State.Builder.GetInsertBlock()->getFirstNonPHIIt());
NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
NewPointerPhi->setDebugLoc(getDebugLoc());
} else {
// The recipe has been unrolled. In that case, fetch the single pointer phi
// shared among all unrolled parts of the recipe.
auto *GEP =
cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
}

// A pointer induction, performed by using a gep
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
// Add induction update using an incorrect block temporarily. The phi node
// will be fixed after VPlan execution. Note that at this point the latch
// block cannot be used, as it does not exist yet.
// TODO: Model increment value in VPlan, by turning the recipe into a
// multi-def and a subclass of VPHeaderPHIRecipe.
if (CurrentPart == 0) {
// The recipe represents the first part of the pointer induction. Create the
// GEP to increment the phi across all unrolled parts.
Value *NumUnrolledElems = State.get(getOperand(2), true);

Value *InductionGEP = GetElementPtrInst::Create(
State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(
ScalarStepValue,
State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
"ptr.ind", InductionLoc);

NewPointerPhi->addIncoming(InductionGEP, VectorPH);
}

// Create actual address geps that use the pointer phi as base and a
// vectorized version of the step value (<step*0, ..., step*N>) as offset.
Type *VecPhiType = VectorType::get(PhiType, State.VF);
Value *StartOffsetScalar = State.Builder.CreateMul(
RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
Value *StartOffset =
State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
// Create a vector of consecutive numbers from zero to VF.
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));

assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
"scalar step must be the same across all parts");
Value *GEP = State.Builder.CreateGEP(
State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
State.VF, ScalarStepValue)),
"vector.gep");
State.set(this, GEP);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -3929,11 +3861,6 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
// Manually move it with the other PHIs in case PHI recipes above this one
// also inserted non-phi instructions.
// TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
// convertToConcreteRecipes.
VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
State.set(this, VecPhi);
}

107 changes: 107 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -973,6 +973,7 @@ static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
RFlags.getGEPNoWrapFlags());
}
case VPInstruction::PtrAdd:
case VPInstruction::WidePtrAdd:
return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
Ops[1],
cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
@@ -2675,6 +2676,106 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
WidenIVR->replaceAllUsesWith(WidePHI);
}

/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
/// initial value, phi and backedge value. In the following example:
///
/// <x1> vector loop: {
/// vector.body:
/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
/// ...
/// EMIT branch-on-count ...
/// }
///
/// WIDEN-POINTER-INDUCTION will get expanded to:
///
/// <x1> vector loop: {
/// vector.body:
/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
/// EMIT %mul = mul %stepvector, %step
/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
/// ...
/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
/// EMIT branch-on-count ...
/// }
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
VPTypeAnalysis &TypeInfo) {
VPlan *Plan = R->getParent()->getPlan();

assert(R->getInductionDescriptor().getKind() ==
InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
"Recipe should have been replaced");

unsigned CurrentPart = 0;
if (R->getNumOperands() > 3)
CurrentPart =
cast<ConstantInt>(R->getOperand(4)->getLiveInIRValue())->getZExtValue();

VPBuilder Builder(R);
DebugLoc DL = R->getDebugLoc();

// Build a pointer phi
VPPhi *Phi;
if (CurrentPart == 0) {
Phi = Builder.createScalarPhi({R->getStartValue()}, R->getDebugLoc(),
"pointer.phi");
} else {
// The recipe has been unrolled. In that case, fetch the single pointer phi
// shared among all unrolled parts of the recipe.
auto *PtrAdd = cast<VPInstruction>(R->getOperand(3));
Phi = cast<VPPhi>(PtrAdd->getOperand(0)->getDefiningRecipe());
}

Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());

// A pointer induction, performed by using a gep
Type *PhiType = TypeInfo.inferScalarType(R->getStepValue());
VPValue *RuntimeVF = Builder.createScalarZExtOrTrunc(
&Plan->getVF(), PhiType, TypeInfo.inferScalarType(&Plan->getVF()), DL);
if (CurrentPart == 0) {
// The recipe represents the first part of the pointer induction. Create the
// GEP to increment the phi across all unrolled parts.
VPValue *NumUnrolledElems = Builder.createScalarZExtOrTrunc(
R->getOperand(2), PhiType, TypeInfo.inferScalarType(R->getOperand(2)),
DL);
VPValue *Offset = Builder.createNaryOp(
Instruction::Mul, {R->getStepValue(), NumUnrolledElems});

VPBuilder::InsertPointGuard Guard(Builder);
VPBasicBlock *ExitingBB =
Plan->getVectorLoopRegion()->getExitingBasicBlock();
Builder.setInsertPoint(ExitingBB,
ExitingBB->getTerminator()->getIterator());

VPValue *InductionGEP = Builder.createPtrAdd(Phi, Offset, DL, "ptr.ind");
Phi->addOperand(InductionGEP);
}

VPValue *CurrentPartV =
Plan->getOrAddLiveIn(ConstantInt::get(PhiType, CurrentPart));

// Create actual address geps that use the pointer phi as base and a
// vectorized version of the step value (<step*0, ..., step*N>) as offset.
VPValue *StartOffsetScalar =
Builder.createNaryOp(Instruction::Mul, {RuntimeVF, CurrentPartV});
VPValue *StartOffset =
Builder.createNaryOp(VPInstruction::Broadcast, StartOffsetScalar);
Contributor: Broadcast needed due to running after materializeBroadcasts?

// Create a vector of consecutive numbers from zero to VF.
StartOffset = Builder.createNaryOp(
Instruction::Add,
{StartOffset,
Builder.createNaryOp(VPInstruction::StepVector, {}, PhiType)});
Contributor: Thinking more about it now, could this use VPInstruction::WideIVStep, or a variant of it, and use a similar approach to how VPWidenIntOrFpInductionRecipes are handled?

The multiple parts are handled during unrolling, which may work for VPWidenPointerInductionRecipe as well. Unrolling would have to create a single scalar phi for the first part, and then something like GEP %scalar.ptr, wide-iv-step?

Contributor Author: I tried this out and I was able to handle the unrolling in UnrollState::unrollWidenInductionByUF, alongside VPWidenIntOrFpInductionRecipe.

Just to double-check though, we still need VPInstruction::WidePtrAdd since we still emit a vector of pointers, which regular VPInstruction::PtrAdd can't do.

Just after unrolling the VPlan looks like:

vector.ph:
  vp<%3> = DERIVED-IV ir<%p> + vp<%2> * ir<32>
  EMIT vp<%4> = wide-iv-step vp<%0>, ir<32>
Successor(s): vector loop

<x1> vector loop: {
  vector.body:
    EMIT vp<%5> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
    EMIT ir<%p.iv> = WIDEN-POINTER-INDUCTION ir<%p>, ir<32>, vp<%1>, vp<%4>, vp<%step.add.3>
    EMIT vp<%step.add> = wide-ptradd ir<%p.iv>, vp<%4>
    EMIT vp<%step.add.2> = wide-ptradd vp<%step.add>, vp<%4>
    EMIT vp<%step.add.3> = wide-ptradd vp<%step.add.2>, vp<%4>
    vp<%6> = SCALAR-STEPS vp<%5>, ir<1>, vp<%0>
    CLONE ir<%gep> = getelementptr ir<%p>, vp<%6>
    vp<%7> = vector-pointer ir<%gep>
    vp<%8> = vector-pointer ir<%gep>, ir<1>
    vp<%9> = vector-pointer ir<%gep>, ir<2>
    vp<%10> = vector-pointer ir<%gep>, ir<3>
    WIDEN store vp<%7>, ir<%p.iv>
    WIDEN store vp<%8>, vp<%step.add>
    WIDEN store vp<%9>, vp<%step.add.2>
    WIDEN store vp<%10>, vp<%step.add.3>
    EMIT vp<%index.next> = add nuw vp<%5>, vp<%1>
    EMIT branch-on-count vp<%index.next>, vp<%2>
  No successors
}

And then when converted to concrete recipes:

ir-bb<vector.ph>:
  IR   %n.mod.vf = urem i64 %n, 16
  IR   %n.vec = sub i64 %n, %n.mod.vf
  vp<%1> = DERIVED-IV ir<%p> + ir<%n.vec> * ir<32>
  EMIT vp<%2> = mul ir<4>, ir<32>
Successor(s): vector.body

vector.body:
  EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
  EMIT-SCALAR vp<%pointer.phi> = phi [ ir<%p>, ir-bb<vector.ph> ], [ vp<%ptr.ind>, vector.body ]
  EMIT vp<%3> = step-vector i64
  EMIT vp<%4> = mul vp<%3>, ir<32>
  EMIT vp<%vector.gep> = wide-ptradd vp<%pointer.phi>, vp<%4>
  EMIT vp<%step.add> = wide-ptradd vp<%vector.gep>, vp<%2>
  EMIT vp<%step.add.2> = wide-ptradd vp<%step.add>, vp<%2>
  EMIT vp<%step.add.3> = wide-ptradd vp<%step.add.2>, vp<%2>
  CLONE ir<%gep> = getelementptr ir<%p>, vp<%index>
  vp<%5> = vector-pointer ir<%gep>, ir<1>
  vp<%6> = vector-pointer ir<%gep>, ir<2>
  vp<%7> = vector-pointer ir<%gep>, ir<3>
  WIDEN store ir<%gep>, vp<%vector.gep>
  WIDEN store vp<%5>, vp<%step.add>
  WIDEN store vp<%6>, vp<%step.add.2>
  WIDEN store vp<%7>, vp<%step.add.3>
  EMIT vp<%index.next> = add nuw vp<%index>, ir<16>
  EMIT vp<%ptr.ind> = ptradd vp<%step.add.3>, vp<%2>
  EMIT branch-on-count vp<%index.next>, ir<%n.vec>
Successor(s): middle.block, vector.body

Contributor Author: This should be done now in 03369e2; it makes the expansion a good bit simpler. We don't actually end up needing to special-case unrolling because VFxUF is actually passed to the "VF" operand. I think there's probably a better name for this; "VF" is a bit misleading.

Contributor: Good to see this worked out, thanks!


VPValue *PtrAdd = Builder.createWidePtrAdd(
Phi,
Builder.createNaryOp(Instruction::Mul, {StartOffset, R->getStepValue()}),
DL, "vector.gep");

R->replaceAllUsesWith(PtrAdd);
}
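To make the offset arithmetic above concrete, a small standalone sketch (plain C++, not LLVM API; all names are illustrative) of the values the expansion materializes for a fixed-width VF. With VF = 4, part 0 and a byte step of 1024, the per-lane offsets are {0, 1024, 2048, 3072}, matching the wide-ptradd operands in the pr48340 dump quoted earlier, and the backedge ptradd advances by step * NumUnrolledElems bytes.

#include <cstdint>
#include <vector>

// Per-lane byte offsets fed to the wide-ptradd for one unrolled part:
//   offset[l] = (Part * VF + l) * StepInBytes
std::vector<int64_t> widePtrAddOffsets(unsigned VF, unsigned Part,
                                       int64_t StepInBytes) {
  std::vector<int64_t> Offsets(VF);
  for (unsigned L = 0; L != VF; ++L)
    Offsets[L] = int64_t(Part * VF + L) * StepInBytes;
  return Offsets;
}

// Scalar byte increment applied on the backedge (the "ptr.ind" ptradd):
int64_t backedgeIncrement(int64_t StepInBytes, int64_t NumUnrolledElems) {
  return StepInBytes * NumUnrolledElems; // e.g. 1024 * 16 in the dump above
}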

void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
// Replace loop regions with explicity CFG.
SmallVector<VPRegionBlock *> LoopRegions;
@@ -2711,6 +2812,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
continue;
}

if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
expandVPWidenPointerInduction(WidenIVR, TypeInfo);
ToRemove.push_back(WidenIVR);
continue;
}

if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
Expr->decompose();
ToRemove.push_back(Expr);