Skip to content

Commit 82a77c1

Browse files
Author: git apple-llvm automerger (committer: same)
Merge commit '4e69258bf390' from llvm.org/main into next
2 parents: 010951f + 4e69258 — merged as commit 82a77c1

11 files changed: +473 additions, −354 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4603,8 +4603,31 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
46034603
continue;
46044604

46054605
InstructionCost C = CM.expectedCost(VF);
4606-
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
46074606

4607+
// Add on other costs that are modelled in VPlan, but not in the legacy
4608+
// cost model.
4609+
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4610+
CM, CM.CostKind);
4611+
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4612+
assert(VectorRegion && "Expected to have a vector region!");
4613+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4614+
vp_depth_first_shallow(VectorRegion->getEntry()))) {
4615+
for (VPRecipeBase &R : *VPBB) {
4616+
auto *VPI = dyn_cast<VPInstruction>(&R);
4617+
if (!VPI)
4618+
continue;
4619+
switch (VPI->getOpcode()) {
4620+
case VPInstruction::ActiveLaneMask:
4621+
case VPInstruction::ExplicitVectorLength:
4622+
C += VPI->cost(VF, CostCtx);
4623+
break;
4624+
default:
4625+
break;
4626+
}
4627+
}
4628+
}
4629+
4630+
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
46084631
unsigned Width =
46094632
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
46104633
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -801,6 +801,21 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
801801
cast<VectorType>(VectorTy), Mask,
802802
Ctx.CostKind, VF.getKnownMinValue() - 1);
803803
}
804+
case VPInstruction::ActiveLaneMask: {
805+
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
806+
Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
807+
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
808+
{ArgTy, ArgTy});
809+
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
810+
}
811+
case VPInstruction::ExplicitVectorLength: {
812+
Type *Arg0Ty = Ctx.Types.inferScalarType(getOperand(0));
813+
Type *I32Ty = Type::getInt32Ty(Ctx.LLVMCtx);
814+
Type *I1Ty = Type::getInt1Ty(Ctx.LLVMCtx);
815+
IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
816+
I32Ty, {Arg0Ty, I32Ty, I1Ty});
817+
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
818+
}
804819
default:
805820
// TODO: Compute cost other VPInstructions once the legacy cost model has
806821
// been retired.

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 35 additions & 134 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -99,49 +99,49 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
9999
; PRED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
100100
; PRED: [[VECTOR_MEMCHECK]]:
101101
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
102-
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
102+
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 16
103103
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
104104
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
105105
; PRED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
106106
; PRED: [[VECTOR_PH]]:
107107
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
108-
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
108+
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
109109
; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
110110
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
111111
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
112112
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
113113
; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
114-
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
115-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
116-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
114+
; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
115+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[X]], i64 0
116+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
117117
; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
118-
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
118+
; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
119119
; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
120120
; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
121121
; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
122-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
123-
; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
122+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
123+
; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 16 x i32> [[BROADCAST_SPLAT]] to <vscale x 16 x i16>
124124
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
125125
; PRED: [[VECTOR_BODY]]:
126126
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
127-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
127+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
128128
; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
129129
; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
130130
; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
131-
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
132-
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
133-
; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
134-
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
135-
; PRED-NEXT: [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
136-
; PRED-NEXT: [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> splat (i32 1) to <vscale x 8 x i16>)
137-
; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
131+
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP19]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
132+
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
133+
; PRED-NEXT: [[TMP25:%.*]] = mul <vscale x 16 x i16> [[TMP24]], [[TMP16]]
134+
; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
135+
; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 16 x i16> [[TMP25]], [[TMP20]]
136+
; PRED-NEXT: [[TMP22:%.*]] = lshr <vscale x 16 x i16> [[TMP21]], trunc (<vscale x 16 x i32> splat (i32 1) to <vscale x 16 x i16>)
137+
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP22]] to <vscale x 16 x i8>
138138
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
139139
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
140-
; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
140+
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP23]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
141141
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
142-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
143-
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
144-
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
142+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP15]])
143+
; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
144+
; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 16 x i1> [[TMP28]], i32 0
145145
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
146146
; PRED: [[MIDDLE_BLOCK]]:
147147
; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]

0 commit comments

Comments (0)