llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (2 additions, 1 deletion)

@@ -7192,7 +7192,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
Comment on lines 7192 to 7193

Contributor: Is this comment still relevant?

Collaborator: Yes, the comment indicates that these transforms still take place during VPlan execution rather than during planning. Perhaps worth rewording to clarify.

VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializePacksAndUnpacks,
BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
bool HasBranchWeights =
llvm/lib/Transforms/Vectorize/VPlan.h (14 additions)

@@ -1064,6 +1064,11 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
/// Extracts all lanes from its (non-scalable) vector operand. This is an
/// abstract VPInstruction whose single defined VPValue represents VF
/// scalars extracted from a vector, to be replaced by VF ExtractElement
/// VPInstructions?
Collaborator (suggested change):

-  /// VPInstructions?
+  /// VPInstructions.

Contributor Author: Fixed, thanks.

Unpack,
};

/// Returns true if this VPInstruction generates scalar values for all lanes.
@@ -2718,6 +2723,15 @@ class LLVM_ABI_FOR_TEST VPReductionRecipe : public VPRecipeWithIRFlags {
return R && classof(R);
}

static inline bool classof(const VPValue *VPV) {
const VPRecipeBase *R = VPV->getDefiningRecipe();
return R && classof(R);
}

static inline bool classof(const VPSingleDefRecipe *R) {
return classof(static_cast<const VPRecipeBase *>(R));
}

Comment on lines +2725 to +2733

Collaborator: OK, related to the introduction of Unpack?

Contributor Author: It's needed for this change, due to the new cast/dyn_cast/isa uses with VPSingleDefRecipe.
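For readers unfamiliar with LLVM's casting machinery, here is a standalone sketch (simplified stand-in types, not the real VPlan classes) of why one `classof` overload per source type is needed: `isa<>`/`dyn_cast<>` dispatch to `To::classof(From *)` by overload resolution on the static type of the pointer, so casting from a `VPValue` or a `VPSingleDefRecipe` requires a matching overload.

```cpp
#include <cassert>

struct VPRecipeBase {
  enum RecipeKind { RK_Other, RK_Reduction };
  RecipeKind Kind;
  explicit VPRecipeBase(RecipeKind K) : Kind(K) {}
};

struct VPValue {
  VPRecipeBase *DefiningRecipe;
  explicit VPValue(VPRecipeBase *R = nullptr) : DefiningRecipe(R) {}
  const VPRecipeBase *getDefiningRecipe() const { return DefiningRecipe; }
};

// Like the real class, inherits from both VPRecipeBase and VPValue.
struct VPSingleDefRecipe : VPRecipeBase, VPValue {
  explicit VPSingleDefRecipe(RecipeKind K) : VPRecipeBase(K), VPValue(this) {}
};

struct VPReductionRecipe : VPSingleDefRecipe {
  VPReductionRecipe() : VPSingleDefRecipe(RK_Reduction) {}
  // One classof per static source type that isa<> should accept.
  static bool classof(const VPRecipeBase *R) { return R->Kind == RK_Reduction; }
  static bool classof(const VPValue *V) {
    const VPRecipeBase *R = V->getDefiningRecipe();
    return R && classof(R);
  }
  // Without this overload, calling classof on a VPSingleDefRecipe* would be
  // ambiguous between the VPRecipeBase* and VPValue* overloads above.
  static bool classof(const VPSingleDefRecipe *R) {
    return classof(static_cast<const VPRecipeBase *>(R));
  }
};

// Minimal isa<> in the spirit of llvm/Support/Casting.h: it dispatches to
// To::classof(From *), chosen by overload resolution on From.
template <typename To, typename From> bool isa(const From *V) {
  return To::classof(V);
}

int main() {
  VPReductionRecipe Red;
  const VPValue *AsValue = &Red;
  const VPSingleDefRecipe *AsSingleDef = &Red;
  assert(isa<VPReductionRecipe>(AsValue));     // uses the VPValue overload
  assert(isa<VPReductionRecipe>(AsSingleDef)); // uses the new disambiguator
  return 0;
}
```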

/// Generate the reduction in the loop.
void execute(VPTransformState &State) override;

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (1 addition)

@@ -110,6 +110,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::AnyOf:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
case VPInstruction::Unpack:
return SetResultTyFromOp();
case VPInstruction::ExtractLane:
return inferScalarType(R->getOperand(1));
llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h (6 additions)

@@ -368,6 +368,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}

template <typename Op0_t, typename Op1_t>
inline VPInstruction_match<Instruction::ExtractElement, Op0_t, Op1_t>
m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<Instruction::ExtractElement>(Op0, Op1);
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (5 additions, 2 deletions)

@@ -514,6 +514,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
case VPInstruction::Unpack:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -1240,6 +1241,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::StepVector:
case VPInstruction::ReductionStartVector:
case VPInstruction::VScale:
case VPInstruction::Unpack:
return false;
default:
return true;
@@ -1283,8 +1285,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
return getNumOperands() > 1;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
case VPInstruction::WidePtrAdd:
return Op == getOperand(0);
Comment on lines 1287 to 1288

Contributor: Unrelated?

Contributor Author: The current code here doesn't match the codegen for WidePtrAdd, which is exposed by this patch; it only uses the first lane if the pointer is a single scalar. With this code, we would incorrectly insert an unpack and only provide the first lane instead of the vector of pointers that is needed.

Collaborator: How/is this related?

Contributor Author: The patch exposed an issue here; more details in https://github.com/llvm/llvm-project/pull/155670/files#r2347298616 above.

Collaborator: Defaulting to false seems a bit odd, as the first operand should be a scalar base. Worth a comment?

Contributor Author: Added a comment. It supports both vector and scalar base addresses.
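To make the hazard concrete, a toy sketch (plain C++ containers standing in for VPlan defs, not real recipes) of what goes wrong when an operand that feeds all lanes is treated as first-lane-only:

```cpp
#include <cassert>
#include <vector>

// A "def" is modeled as its per-lane values. WidePtrAdd with a vector base
// reads every lane of that base.
using Lanes = std::vector<long>;

Lanes widePtrAdd(const Lanes &Base, const Lanes &Offsets) {
  Lanes Result(Base.size());
  for (size_t I = 0; I < Base.size(); ++I)
    Result[I] = Base[I] + Offsets[I];
  return Result;
}

int main() {
  Lanes Base = {100, 200, 300, 400};
  Lanes Offsets = {1, 2, 3, 4};
  Lanes Correct = widePtrAdd(Base, Offsets);

  // If onlyFirstLaneUsed wrongly claimed the base is first-lane-only, the
  // unpack materialization could legally hand WidePtrAdd just lane 0:
  Lanes FirstLaneOnly(Base.size(), Base[0]);
  Lanes Wrong = widePtrAdd(FirstLaneOnly, Offsets);

  assert(Correct[0] == Wrong[0]); // lane 0 happens to agree
  assert(Correct[1] != Wrong[1]); // remaining lanes are miscompiled
  return 0;
}
```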

case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
@@ -1408,6 +1408,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ResumeForEpilogue:
O << "resume-for-epilogue";
break;
case VPInstruction::Unpack:
O << "unpack";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (44 additions, 1 deletion)

@@ -1223,6 +1223,13 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}

uint64_t Idx;
if (match(&R, m_ExtractElement(m_BuildVector(), m_ConstantInt(Idx)))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Collaborator: Could m_BuildVector(Op0) provide the desired operand?

Contributor Author: Hmm, I think it could, but it would be inconsistent with the other matchers, where the inner matchers match the operands.
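A standalone illustration of the fold being added here (plain C++ stand-ins, not VPlan code): extracting a constant lane from a build-vector of scalar defs just forwards the corresponding scalar operand, so the vector never needs to be materialized.

```cpp
#include <array>
#include <cassert>

int main() {
  // The scalar defs packed by a BuildVector, and a constant extract index.
  std::array<int, 4> BuildVectorOps = {10, 20, 30, 40};
  constexpr unsigned Idx = 2;
  // extractelement(buildvector(ops...), Idx) simplifies to ops[Idx]:
  assert(BuildVectorOps[Idx] == 30);
  return 0;
}
```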

Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
return;
}

if (auto *Phi = dyn_cast<VPPhi>(Def)) {
if (Phi->getNumOperands() == 1)
Phi->replaceAllUsesWith(Phi->getOperand(0));
@@ -3740,7 +3747,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}

void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
void VPlanTransforms::materializePacksAndUnpacks(VPlan &Plan) {
if (Plan.hasScalarVFOnly())
return;

@@ -3789,6 +3796,42 @@ void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
});
}
}

// Create explicit VPInstructions to convert vectors to scalars.
for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (isa<VPReplicateRecipe, VPInstruction, VPScalarIVStepsRecipe>(&R))
continue;
for (VPValue *Def : R.definedValues()) {
if (vputils::isSingleScalar(Def) || vputils::onlyFirstLaneUsed(Def))
Collaborator: This is trying to identify all vector Defs, by excluding those that define a single scalar and those that define multiple scalars (replicate and scalar-steps recipes). Better to try to capture this directly via some vputils helper? VPInstructions that define vectors are also excluded?

Collaborator: The onlyFirstLaneUsed case may require a single extract rather than a full unpack.

Contributor Author: Yep, I added a comment that the recipes skipped here may or may not produce a vector eventually, and a TODO to introduce unpacks for them as well and to remove them once we are guaranteed to produce scalars. For that reason, I left the code as-is here for now.

continue;

Collaborator: Worth explaining how scalar users inside replicate regions (continue to) extract their values without going through an Unpack VPInstruction.

Contributor Author: Done and added a TODO, thanks!

Collaborator: Thanks! Better placed before the lambda?

Contributor Author: Should be moved, thanks.

auto IsInsideReplicateRegion = [LoopRegion](VPUser *U) {
VPRegionBlock *ParentRegion =
cast<VPRecipeBase>(U)->getParent()->getParent();
return ParentRegion && ParentRegion != LoopRegion;
Collaborator: Better to check whether ParentRegion is replicating, consistent with the lambda's name?

Contributor Author: Done, thanks!

};
// At the moment, we only create unpacks for scalar users outside
// replicate regions. Recipes inside replicate regions still manually
// extract the required lanes. TODO: Remove once replicate regions are
// unrolled explicitly.
Collaborator (suggested change):

-      // unrolled explicitly.
+      // unrolled completely.

They are already unrolled explicitly by UF, but have yet to also be unrolled by VF.

Contributor Author: Yep, updated, thanks.

if (none_of(Def->users(), [Def, &IsInsideReplicateRegion](VPUser *U) {
return !IsInsideReplicateRegion(U) && U->usesScalars(Def);
Collaborator (suggested change):

-            return !IsInsideReplicateRegion(U) && U->usesScalars(Def);
+            return U->usesScalars(Def) && !IsInsideReplicateRegion(U);

Sounds a bit cleaner?

Contributor Author: Done, thanks.

}))
continue;

auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
if (R.isPhi())
Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
else
Unpack->insertAfter(&R);
Def->replaceUsesWithIf(
Unpack, [Def, &IsInsideReplicateRegion](VPUser &U, unsigned) {
return !IsInsideReplicateRegion(&U) && U.usesScalars(Def);
Collaborator: Ditto. Worth defining an IsCandidateUnpackUser()?

Contributor Author: Updated; merged IsInsideReplicateRegion into a new IsCandidateUnpackUser.

});
}
}
}
}

void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
llvm/lib/Transforms/Vectorize/VPlanTransforms.h (3 additions, 2 deletions)

@@ -326,8 +326,9 @@ struct VPlanTransforms {
VPBasicBlock *VectorPH);

/// Add explicit Build[Struct]Vector recipes that combine multiple scalar
/// values into single vectors.
static void materializeBuildVectors(VPlan &Plan);
/// values into single vectors and UnpackVector to extract scalars from a
Collaborator (suggested change):

-  /// values into single vectors and UnpackVector to extract scalars from a
+  /// values into single vectors and Unpack recipes to extract scalars from a

Contributor Author: Updated, thanks.

/// vector as needed.
static void materializePacksAndUnpacks(VPlan &Plan);

/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (18 additions, 2 deletions)

@@ -466,10 +466,21 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \DefR.
Collaborator: Independent: "\p DefR" rather than "\DefR".

static VPRecipeWithIRFlags *
static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
VPRecipeWithIRFlags *DefR, VPLane Lane,
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
VPValue *Op;
if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
auto LaneDefs = Def2LaneDefs.find(Op);
if (LaneDefs != Def2LaneDefs.end())
return LaneDefs->second[Lane.getKnownLane()];

VPValue *Idx =
Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
}

// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : DefR->operands()) {
@@ -481,6 +492,10 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
continue;
}
if (Lane.getKind() == VPLane::Kind::ScalableLast) {
// Look through mandatory Unpack.
[[maybe_unused]] bool Matched =
match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
assert(Matched && "original op must have been Unpack");
NewOps.push_back(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
@@ -548,7 +563,8 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
(isa<VPReplicateRecipe>(&R) &&
cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
(isa<VPInstruction>(&R) &&
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes()))
!cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
Collaborator: Should Unpack indicate that it doesGeneratePerAllLanes?

Contributor Author: It may not generate values for all lanes at the moment, if only its first lane is used; for now, all cases need to be handled by unrolling.
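A toy sketch of the lane-by-lane lowering that cloneForLane performs for Unpack (strings stand in for VPValues and the Def2LaneDefs map is simulated; not the real implementation): each lane either reuses an already-scalarized per-lane def or becomes an extract with a constant lane index.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  const unsigned VF = 4;
  // Def2LaneDefs analogue: operands that were already fully scalarized,
  // mapped to their per-lane scalar defs.
  std::map<std::string, std::vector<std::string>> Def2LaneDefs = {
      {"%w", {"%w.l0", "%w.l1", "%w.l2", "%w.l3"}}};

  auto UnpackLane = [&](const std::string &Vec, unsigned Lane) -> std::string {
    auto It = Def2LaneDefs.find(Vec);
    if (It != Def2LaneDefs.end())
      return It->second[Lane]; // reuse the existing scalar def for this lane
    // Otherwise emit an extract with a constant lane index, as the
    // ExtractElement created in cloneForLane above does.
    return "extractelement(" + Vec + ", " + std::to_string(Lane) + ")";
  };

  for (unsigned L = 0; L != VF; ++L)
    std::cout << UnpackLane("%v", L) << '\n'; // %v not scalarized: extracts
  for (unsigned L = 0; L != VF; ++L)
    std::cout << UnpackLane("%w", L) << '\n'; // reuses %w.l0 .. %w.l3
  return 0;
}
```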

continue;

auto *DefR = cast<VPRecipeWithIRFlags>(&R);
@@ -241,12 +241,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[INDEX]], [[MUL_2_I]]
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
@@ -27,15 +27,15 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]])
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]])
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
@@ -61,8 +61,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1
; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1
; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
@@ -17,6 +17,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_LOAD_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
@@ -59,7 +60,6 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP24]] = phi <4 x i16> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP24]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32>
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]]
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
@@ -284,17 +284,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16>
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2
; CHECK-NEXT: store i64 0, ptr [[A]], align 8
; CHECK-NEXT: store i64 0, ptr [[B]], align 8
@@ -205,25 +205,25 @@ define void @test_load_gep_widen_induction(ptr noalias %dst, ptr noalias %dst2)
; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_2]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_3]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
; CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP5]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0
; CHECK-NEXT: store ptr null, ptr [[TMP6]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP7]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0
; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: store ptr null, ptr [[TMP10]], align 8
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
; CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8
; CHECK-NEXT: store ptr null, ptr [[TMP17]], align 8
Collaborator: This store to TMP17 replaces the one removed above to TMP4?

Collaborator: Ah, OK: this store to TMP17 replaces the one to TMP11, which is retained but TMP11 is redefined. Wonder if some simple "sink each ExtractElement to before its single/first user" pass could help confirm all test differences?
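The suggested "sink each ExtractElement to before its single/first user" pass is hypothetical; a rough standalone prototype of the idea over a toy instruction list (not an existing LLVM pass) might look like this:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Inst {
  std::string Name;             // value the instruction defines
  std::vector<std::string> Ops; // values it uses
  bool IsExtract = false;
};

int main() {
  // %e is defined early but first used by the last instruction, mirroring
  // the reordered extracts in the test diffs above.
  std::vector<Inst> Block = {{"%e", {"%vec"}, true},
                             {"%a", {"%x"}, false},
                             {"%b", {"%a", "%e"}, false}};

  // Sink each extract directly before its first user.
  for (size_t I = 0; I < Block.size(); ++I) {
    if (!Block[I].IsExtract)
      continue;
    for (size_t J = I + 1; J < Block.size(); ++J) {
      const std::vector<std::string> &Ops = Block[J].Ops;
      if (std::find(Ops.begin(), Ops.end(), Block[I].Name) == Ops.end())
        continue;
      Inst Extract = Block[I];
      Block.erase(Block.begin() + I);
      // The first user shifted from J to J - 1 after the erase.
      Block.insert(Block.begin() + (J - 1), Extract);
      break;
    }
  }

  for (const Inst &In : Block)
    std::cout << In.Name << '\n'; // prints %a, %e, %b
  return 0;
}
```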

; CHECK-NEXT: [[TMP12:%.*]] = getelementptr ptr, ptr [[DST2]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 2
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 4
Expand Down