Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// part if scalar. In the latter case, the recipe will be removed during
// unrolling.
ExtractLastElement,
// Extracts the last lane for each part from its operand.
ExtractLastLanePerPart,
Comment on lines 1012 to +1016
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given ExtractLastLanePerPart which produces a scalar per part, should ExtractLastPart also be added - which together could then replace ExtractLastElement? Perhaps suffice to call it ExtractLastLane(*) - producing a scalar from (each) vector, orthogonal of how many parts/vectors it is given. I.e., ExtractLastElement could then be replaced by ExtractLastPart if its operand is scalar or else by ExtractLastPart(ExtractLastLane) or ExtractLastLane(ExtractLastPart) if it's vector.

(*) ExtractLane seems to apply similar double extraction, which could also be simplified by combining with some "de-splat" reduction of a uniform-across-UF value that provides a(ny) single part?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, I will check that as follow-up, thanks!

// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
m_ExtractLastLanePerPart(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
Expand Down Expand Up @@ -878,9 +879,11 @@ Value *VPInstruction::generate(VPTransformState &State) {

return ReducedPartRdx;
}
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
Expand Down Expand Up @@ -1166,6 +1169,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,

bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
getOpcode() == VPInstruction::ExtractLastLanePerPart ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
Expand Down Expand Up @@ -1229,6 +1233,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
Expand Down Expand Up @@ -1376,6 +1381,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
case VPInstruction::ExtractLastLanePerPart:
O << "extract-last-lane-per-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
Expand Down
28 changes: 21 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1207,7 +1207,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
}

// Look through ExtractLastElement (BuildVector ....).
if (match(&R, m_ExtractLastElement(m_BuildVector()))) {
if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
m_ExtractLastLanePerPart(m_BuildVector())))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
Expand Down Expand Up @@ -1273,20 +1274,29 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}

if (match(Def, m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) {
if (match(Def,
m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
Def->replaceAllUsesWith(A);
return;
}

if (match(Def,
m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
if (match(Def, m_CombineOr(m_VPInstruction<VPInstruction::ExtractLastElement>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Since you're here, perhaps worth changing m_VPInstruction<VPInstruction::ExtractLastElement> to m_ExtractLastElement as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep updated thanks

m_VPValue(A)),
m_ExtractLastLanePerPart(m_VPValue(A)))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
all_of(A->users(),
[Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
return Def->replaceAllUsesWith(A);
}

if (Plan->getUF() == 1 &&
match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
return Def->replaceAllUsesWith(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
}
}

void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
Expand Down Expand Up @@ -1324,8 +1334,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
{Clone->getOperand(0)});
unsigned ExtractOpc =
vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
? VPInstruction::ExtractLastElement
: VPInstruction::ExtractLastLanePerPart;
auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
Ext->insertBefore(Clone);
Clone->setOperand(0, Ext);
RepR->eraseFromParent();
Expand All @@ -1339,7 +1352,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
match(cast<VPRecipeBase>(U),
m_ExtractLastElement(m_VPValue()));
m_CombineOr(m_ExtractLastElement(m_VPValue()),
m_ExtractLastLanePerPart(m_VPValue())));
}))
continue;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,20 @@ define void @uniform_gep_for_replicating_gep(ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[VEC_IND]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP18]], align 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP19]], align 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
Expand Down
9 changes: 5 additions & 4 deletions llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ exit:
ret void
}

; FIXME: Currently this mis-compiled when interleaving; all stores store the
; last lane of the last part, instead of the last lane per part.
; Check each unrolled store stores the last lane of the corresponding part.
; Test case for https://github.com/llvm/llvm-project/issues/162498.
define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) {
; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
Expand Down Expand Up @@ -121,13 +120,15 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2
; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3
; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP7]], 1
; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]]
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP8]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
Expand Down