Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// part if scalar. In the latter case, the recipe will be removed during
// unrolling.
ExtractLastElement,
// Extracts the last lane for each part from its operand.
ExtractLastLanePerPart,
Comment on lines 1012 to +1016
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given ExtractLastLanePerPart which produces a scalar per part, should ExtractLastPart also be added - which together could then replace ExtractLastElement? Perhaps suffice to call it ExtractLastLane(*) - producing a scalar from (each) vector, orthogonal of how many parts/vectors it is given. I.e., ExtractLastElement could then be replaced by ExtractLastPart if its operand is scalar or else by ExtractLastPart(ExtractLastLane) or ExtractLastLane(ExtractLastPart) if it's vector.

(*) ExtractLane seems to apply similar double extraction, which could also be simplified by combining with some "de-splat" reduction of a uniform-across-UF value that provides a(ny) single part?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, I will check that as follow-up, thanks!

// Extracts the second-to-last lane from its operand or the second-to-last
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,12 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}

template <typename Op0_t>
inline VPInstruction_match<VPInstruction::ExtractLastLanePerPart, Op0_t>
m_ExtractLastLanePerPart(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastLanePerPart>(Op0);
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::FirstActiveLane:
case VPInstruction::Not:
Expand Down Expand Up @@ -878,9 +879,11 @@ Value *VPInstruction::generate(VPTransformState &State) {

return ReducedPartRdx;
}
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractPenultimateElement: {
unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
unsigned Offset =
getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
Expand Down Expand Up @@ -1166,6 +1169,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,

bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractLastElement ||
getOpcode() == VPInstruction::ExtractLastLanePerPart ||
getOpcode() == VPInstruction::ExtractPenultimateElement ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::ExtractLane ||
Expand Down Expand Up @@ -1229,6 +1233,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractLastElement:
case VPInstruction::ExtractLastLanePerPart:
case VPInstruction::ExtractPenultimateElement:
case VPInstruction::ActiveLaneMask:
case VPInstruction::FirstActiveLane:
Expand Down Expand Up @@ -1376,6 +1381,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastElement:
O << "extract-last-element";
break;
case VPInstruction::ExtractLastLanePerPart:
O << "extract-last-lane-per-part";
break;
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
Expand Down
27 changes: 20 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
}

// Look through ExtractLastElement (BuildVector ....).
if (match(&R, m_ExtractLastElement(m_BuildVector()))) {
if (match(&R, m_CombineOr(m_ExtractLastElement(m_BuildVector()),
m_ExtractLastLanePerPart(m_BuildVector())))) {
auto *BuildVector = cast<VPInstruction>(R.getOperand(0));
Def->replaceAllUsesWith(
BuildVector->getOperand(BuildVector->getNumOperands() - 1));
Expand Down Expand Up @@ -1275,20 +1276,28 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}

if (match(Def, m_ExtractLastElement(m_Broadcast(m_VPValue(A))))) {
if (match(Def,
m_CombineOr(m_ExtractLastElement(m_Broadcast(m_VPValue(A))),
m_ExtractLastLanePerPart(m_Broadcast(m_VPValue(A)))))) {
Def->replaceAllUsesWith(A);
return;
}

if (match(Def,
m_VPInstruction<VPInstruction::ExtractLastElement>(m_VPValue(A))) &&
if (match(Def, m_CombineOr(m_ExtractLastElement(m_VPValue(A)),
m_ExtractLastLanePerPart(m_VPValue(A)))) &&
((isa<VPInstruction>(A) && vputils::isSingleScalar(A)) ||
(isa<VPReplicateRecipe>(A) &&
cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
all_of(A->users(),
[Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
return Def->replaceAllUsesWith(A);
}

if (Plan->getUF() == 1 &&
match(Def, m_ExtractLastLanePerPart(m_VPValue(A)))) {
return Def->replaceAllUsesWith(
Builder.createNaryOp(VPInstruction::ExtractLastElement, {A}));
}
}

void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
Expand Down Expand Up @@ -1326,8 +1335,11 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Metadata*/);
Clone->insertBefore(RepOrWidenR);
auto *Ext = new VPInstruction(VPInstruction::ExtractLastElement,
{Clone->getOperand(0)});
unsigned ExtractOpc =
vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1))
? VPInstruction::ExtractLastElement
: VPInstruction::ExtractLastLanePerPart;
auto *Ext = new VPInstruction(ExtractOpc, {Clone->getOperand(0)});
Ext->insertBefore(Clone);
Clone->setOperand(0, Ext);
RepR->eraseFromParent();
Expand All @@ -1341,7 +1353,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
!all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
return U->usesScalars(RepOrWidenR) ||
match(cast<VPRecipeBase>(U),
m_ExtractLastElement(m_VPValue()));
m_CombineOr(m_ExtractLastElement(m_VPValue()),
m_ExtractLastLanePerPart(m_VPValue())));
}))
continue;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,20 @@ define void @uniform_gep_for_replicating_gep(ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[VEC_IND]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[STEP_ADD]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = lshr i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP3]] to <2 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i8> [[TMP11]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP18]], align 1
; CHECK-NEXT: store i8 [[TMP22]], ptr [[TMP19]], align 1
; CHECK-NEXT: store i8 [[TMP12]], ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
Expand Down
9 changes: 5 additions & 4 deletions llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ exit:
ret void
}

; FIXME: Currently this mis-compiled when interleaving; all stores store the
; last lane of the last part, instead of the last lane per part.
; Check each unrolled store stores the last lane of the corresponding part.
; Test case for https://github.com/llvm/llvm-project/issues/162498.
define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(ptr %dst) {
; VF4IC1-LABEL: define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
Expand Down Expand Up @@ -121,13 +120,15 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts(
; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF2IC2: [[VECTOR_BODY]]:
; VF2IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2IC2-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
; VF2IC2-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 2
; VF2IC2-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 3
; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[INDEX]], 1
; VF2IC2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP7]], 1
; VF2IC2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP0]], 1
; VF2IC2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP2]]
; VF2IC2-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP3]]
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP8]], ptr [[TMP4]], align 4
; VF2IC2-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; VF2IC2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
Expand Down
Loading