Skip to content

Commit 28e4d5c

Browse files
committed
[LV][EVL] Support cast instruction with EVL-vectorization
1 parent 4a3f46d commit 28e4d5c

File tree

7 files changed

+366
-14
lines changed

7 files changed

+366
-14
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4516,6 +4516,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45164516
case VPDef::VPWidenCallSC:
45174517
case VPDef::VPWidenCanonicalIVSC:
45184518
case VPDef::VPWidenCastSC:
4519+
case VPDef::VPWidenCastEVLSC:
45194520
case VPDef::VPWidenGEPSC:
45204521
case VPDef::VPWidenIntrinsicSC:
45214522
case VPDef::VPWidenSC:

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -866,6 +866,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
866866
case VPRecipeBase::VPWidenCallSC:
867867
case VPRecipeBase::VPWidenCanonicalIVSC:
868868
case VPRecipeBase::VPWidenCastSC:
869+
case VPRecipeBase::VPWidenCastEVLSC:
869870
case VPRecipeBase::VPWidenGEPSC:
870871
case VPRecipeBase::VPWidenIntrinsicSC:
871872
case VPRecipeBase::VPWidenSC:
@@ -1063,6 +1064,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
10631064
R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
10641065
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
10651066
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
1067+
R->getVPDefID() == VPRecipeBase::VPWidenCastEVLSC ||
10661068
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
10671069
R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC ||
10681070
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
@@ -1542,19 +1544,28 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15421544
/// Result type for the cast.
15431545
Type *ResultTy;
15441546

1545-
public:
1546-
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1547-
CastInst &UI)
1547+
protected:
1548+
VPWidenCastRecipe(unsigned VPDefOpcode, Instruction::CastOps Opcode,
1549+
VPValue *Op, Type *ResultTy, CastInst &UI)
15481550
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode),
15491551
ResultTy(ResultTy) {
15501552
assert(UI.getOpcode() == Opcode &&
15511553
"opcode of underlying cast doesn't match");
15521554
}
15531555

1554-
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1556+
VPWidenCastRecipe(unsigned VPDefOpcode, Instruction::CastOps Opcode,
1557+
VPValue *Op, Type *ResultTy)
15551558
: VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode),
15561559
ResultTy(ResultTy) {}
15571560

1561+
public:
1562+
/// Create a widened cast of \p Op to \p ResultTy with opcode \p Opcode,
/// associated with the cast instruction \p UI. Delegates to the protected
/// constructor, passing VPDef::VPWidenCastSC as the recipe ID.
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1563+
CastInst &UI)
1564+
: VPWidenCastRecipe(VPDef::VPWidenCastSC, Opcode, Op, ResultTy, UI) {}
1565+
1566+
/// Create a widened cast of \p Op to \p ResultTy with opcode \p Opcode and
/// no associated IR instruction. Delegates to the protected constructor,
/// passing VPDef::VPWidenCastSC as the recipe ID.
VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
1567+
: VPWidenCastRecipe(VPDef::VPWidenCastSC, Opcode, Op, ResultTy) {}
1568+
15581569
~VPWidenCastRecipe() override = default;
15591570

15601571
VPWidenCastRecipe *clone() override {
@@ -1565,7 +1576,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15651576
return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy);
15661577
}
15671578

1568-
VP_CLASSOF_IMPL(VPDef::VPWidenCastSC)
1579+
static inline bool classof(const VPRecipeBase *R) {
1580+
return R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
1581+
R->getVPDefID() == VPRecipeBase::VPWidenCastEVLSC;
1582+
}
1583+
1584+
static inline bool classof(const VPUser *U) {
1585+
auto *R = dyn_cast<VPRecipeBase>(U);
1586+
return R && classof(R);
1587+
}
15691588

15701589
/// Produce widened copies of the cast.
15711590
void execute(VPTransformState &State) override;
@@ -1586,6 +1605,54 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags {
15861605
Type *getResultType() const { return ResultTy; }
15871606
};
15881607

1608+
// A recipe for widening cast operation with vector-predication intrinsics with
1609+
/// explicit vector length (EVL).
1610+
class VPWidenCastEVLRecipe : public VPWidenCastRecipe {
1611+
using VPRecipeWithIRFlags::transferFlags;
1612+
1613+
public:
1614+
VPWidenCastEVLRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy,
1615+
VPValue &EVL)
1616+
: VPWidenCastRecipe(VPDef::VPWidenCastEVLSC, Opcode, Op, ResultTy) {
1617+
addOperand(&EVL);
1618+
}
1619+
VPWidenCastEVLRecipe(VPWidenCastRecipe &W, VPValue &EVL)
1620+
: VPWidenCastEVLRecipe(W.getOpcode(), W.getOperand(0), W.getResultType(),
1621+
EVL) {
1622+
transferFlags(W);
1623+
}
1624+
1625+
~VPWidenCastEVLRecipe() override = default;
1626+
1627+
VPWidenCastEVLRecipe *clone() final {
1628+
llvm_unreachable("VPWidenEVLRecipe cannot be cloned");
1629+
return nullptr;
1630+
}
1631+
1632+
VP_CLASSOF_IMPL(VPDef::VPWidenCastEVLSC)
1633+
1634+
VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
1635+
const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
1636+
1637+
/// Produce a vp-intrinsic copies of the cast.
1638+
void execute(VPTransformState &State) final;
1639+
1640+
/// Returns true if the recipe only uses the first lane of operand \p Op.
1641+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1642+
assert(is_contained(operands(), Op) &&
1643+
"Op must be an operand of the recipe");
1644+
// EVL in that recipe is always the last operand, thus any use before means
1645+
// the VPValue should be vectorized.
1646+
return getEVL() == Op;
1647+
}
1648+
1649+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1650+
/// Print the recipe.
1651+
void print(raw_ostream &O, const Twine &Indent,
1652+
VPSlotTracker &SlotTracker) const final;
1653+
#endif
1654+
};
1655+
15891656
/// VPScalarCastRecipe is a recipe to create scalar cast instructions.
15901657
class VPScalarCastRecipe : public VPSingleDefRecipe {
15911658
Instruction::CastOps Opcode;

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
9292
case VPVectorPointerSC:
9393
case VPWidenCanonicalIVSC:
9494
case VPWidenCastSC:
95+
case VPWidenCastEVLSC:
9596
case VPWidenGEPSC:
9697
case VPWidenIntOrFpInductionSC:
9798
case VPWidenLoadEVLSC:
@@ -138,6 +139,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
138139
case VPVectorPointerSC:
139140
case VPWidenCanonicalIVSC:
140141
case VPWidenCastSC:
142+
case VPWidenCastEVLSC:
141143
case VPWidenGEPSC:
142144
case VPWidenIntOrFpInductionSC:
143145
case VPWidenPHISC:
@@ -178,6 +180,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
178180
case VPVectorPointerSC:
179181
case VPWidenCanonicalIVSC:
180182
case VPWidenCastSC:
183+
case VPWidenCastEVLSC:
181184
case VPWidenGEPSC:
182185
case VPWidenIntOrFpInductionSC:
183186
case VPWidenPHISC:
@@ -1554,6 +1557,40 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
15541557
dyn_cast_if_present<Instruction>(getUnderlyingValue()));
15551558
}
15561559

1560+
/// Emit the cast as a vector-predicated operation for part 0, using an
/// all-true mask and this recipe's EVL operand as the explicit vector length.
/// Only sext, zext and trunc are handled.
void VPWidenCastEVLRecipe::execute(VPTransformState &State) {
  unsigned Opcode = getOpcode();
  State.setDebugLocFrom(getDebugLoc());
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length.");

  // TODO: add more cast instruction, eg: fptoint/inttofp/inttoptr/fptofp
  // Fail loudly on any other opcode instead of returning without defining a
  // value for this recipe.
  if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt &&
      Opcode != Instruction::Trunc)
    llvm_unreachable("Unsupported cast opcode for VPWidenCastEVLRecipe");

  // Fetch the source once and reuse it for both the sanity check and codegen.
  Value *SrcVal = State.get(getOperand(0), 0);
  assert(SrcVal->getType()->isVectorTy() &&
         "VPWidenCastEVLRecipe should not be used for scalars");
  auto *SrcTy = cast<VectorType>(SrcVal->getType());
  VectorType *DstTy =
      VectorType::get(getResultType(), SrcTy->getElementCount());

  IRBuilderBase &BuilderIR = State.Builder;
  VectorBuilder Builder(BuilderIR);
  Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
  Builder.setMask(Mask).setEVL(State.get(getEVL(), 0, /*NeedsScalar=*/true));

  Value *VPInst =
      Builder.createVectorInstruction(Opcode, DstTy, {SrcVal}, "vp.cast");
  assert(VPInst && "Failed to create VP intrinsic for cast");
  // The result is a VP intrinsic call rather than a CastInst, so there are no
  // cast IR flags to copy onto it. The recipe may also have no underlying
  // instruction, which is why the metadata lookup below tolerates null.
  State.set(this, VPInst, 0);
  State.addMetadata(VPInst,
                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
}
1593+
15571594
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
15581595
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
15591596
VPSlotTracker &SlotTracker) const {
@@ -1564,6 +1601,16 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
15641601
printOperands(O, SlotTracker);
15651602
O << " to " << *getResultType();
15661603
}
1604+
1605+
/// Print the recipe to \p O: the "WIDEN-VP" tag, the defined value, the
/// "vp."-prefixed opcode name, the flags, the operands and the result type.
void VPWidenCastEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  const char *OpcodeName = Instruction::getOpcodeName(getOpcode());
  Type *DestTy = getResultType();

  O << Indent << "WIDEN-VP ";
  printAsOperand(O, SlotTracker);

  O << " = vp." << OpcodeName << " ";
  printFlags(O);
  printOperands(O, SlotTracker);

  O << " to " << *DestTy;
}
15671614
#endif
15681615

15691616
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1477,6 +1477,15 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
14771477
return nullptr;
14781478
return new VPWidenEVLRecipe(*W, EVL);
14791479
})
1480+
.Case<VPWidenCastRecipe>(
1481+
[&](VPWidenCastRecipe *W) -> VPRecipeBase * {
1482+
unsigned Opcode = W->getOpcode();
1483+
if (Opcode != Instruction::SExt &&
1484+
Opcode != Instruction::ZExt &&
1485+
Opcode != Instruction::Trunc)
1486+
return nullptr;
1487+
return new VPWidenCastEVLRecipe(*W, EVL);
1488+
})
14801489
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
14811490
VPValue *NewMask = GetNewMask(Red->getCondOp());
14821491
return new VPReductionEVLRecipe(*Red, EVL, NewMask);

llvm/lib/Transforms/Vectorize/VPlanValue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ class VPDef {
337337
VPWidenCallSC,
338338
VPWidenCanonicalIVSC,
339339
VPWidenCastSC,
340+
VPWidenCastEVLSC,
340341
VPWidenGEPSC,
341342
VPWidenIntrinsicSC,
342343
VPWidenLoadEVLSC,

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -194,16 +194,16 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
194194
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
195195
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
196196
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
197-
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
198-
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
199-
; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = add i32 [[EVL_BASED_IV]], 0
200-
; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]]
201-
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
202-
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP8]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP5]])
203-
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = sext <vscale x 8 x i16> [[VP_OP_LOAD]] to <vscale x 8 x i32>
204-
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP9]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP5]])
197+
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
198+
; IF-EVL-INLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 8, i1 true)
199+
; IF-EVL-INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
200+
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
201+
; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 0
202+
; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.vp.load.nxv8i16.p0(ptr align 2 [[TMP9]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
203+
; IF-EVL-INLOOP-NEXT: [[VP_CAST:%.*]] = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> [[VP_OP_LOAD]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
204+
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[VP_CAST]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), i32 [[TMP6]])
205205
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
206-
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP5]], [[EVL_BASED_IV]]
206+
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
207207
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
208208
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
209209
; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

0 commit comments

Comments
 (0)