Skip to content

Commit de3de3f

Browse files
[LV] Consider interleaving when -enable-wide-lane-mask=true (llvm#163387)
Currently the only way to enable the use of wide active lane masks is to pass -enable-wide-lane-mask and to force both interleaving and tail-folding with additional flags. This patch changes selectInterleaveCount to consider interleaving when wide lane masks are requested, although the feature remains off by default.
1 parent 7a73e69 commit de3de3f

File tree

4 files changed

+121
-17
lines changed

4 files changed

+121
-17
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
249249
"Use predicated EVL instructions for tail folding. If EVL "
250250
"is unsupported, fallback to data-without-lane-mask.")));
251251

252+
cl::opt<bool> llvm::EnableWideActiveLaneMask(
253+
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
254+
cl::desc("Enable use of wide lane masks when used for control flow in "
255+
"tail-folded loops"));
256+
252257
static cl::opt<bool> MaximizeBandwidth(
253258
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
254259
cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1314,6 +1319,12 @@ class LoopVectorizationCostModel {
13141319
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
13151320
}
13161321

1322+
/// Returns true if tail-folding is preferred over a scalar epilogue.
1323+
bool preferPredicatedLoop() const {
1324+
return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
1325+
ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
1326+
}
1327+
13171328
/// Returns the TailFoldingStyle that is best for the current loop.
13181329
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
13191330
if (!ChosenTailFoldingStyle)
@@ -1374,6 +1385,17 @@ class LoopVectorizationCostModel {
13741385
return getTailFoldingStyle() != TailFoldingStyle::None;
13751386
}
13761387

1388+
/// Returns true if the use of wide lane masks is requested and the loop is
1389+
/// using tail-folding with a lane mask for control flow.
1390+
bool useWideActiveLaneMask() const {
1391+
if (!EnableWideActiveLaneMask)
1392+
return false;
1393+
1394+
TailFoldingStyle TF = getTailFoldingStyle();
1395+
return TF == TailFoldingStyle::DataAndControlFlow ||
1396+
TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1397+
}
1398+
13771399
/// Return maximum safe number of elements to be processed per vector
13781400
/// iteration, which do not prevent store-load forwarding and are safe with
13791401
/// regard to the memory dependencies. Required for EVL-based VPlans to
@@ -4560,7 +4582,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
45604582
// 3. We don't interleave if we think that we will spill registers to memory
45614583
// due to the increased register pressure.
45624584

4563-
if (!CM.isScalarEpilogueAllowed())
4585+
// Only interleave tail-folded loops if wide lane masks are requested, as the
4586+
// overhead of multiple instructions to calculate the predicate is likely
4587+
// not beneficial. If a scalar epilogue is not allowed for any other reason,
4588+
// do not interleave.
4589+
if (!CM.isScalarEpilogueAllowed() &&
4590+
!(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
45644591
return 1;
45654592

45664593
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,6 @@
4040
using namespace llvm;
4141
using namespace VPlanPatternMatch;
4242

43-
static cl::opt<bool> EnableWideActiveLaneMask(
44-
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
45-
cl::desc("Enable use of wide get active lane mask instructions"));
46-
4743
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
4844
VPlan &Plan,
4945
function_ref<const InductionDescriptor *(PHINode *)>

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class VPRecipeBuilder;
3232
struct VFRange;
3333

3434
extern cl::opt<bool> VerifyEachVPlan;
35+
extern cl::opt<bool> EnableWideActiveLaneMask;
3536

3637
struct VPlanTransforms {
3738
/// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra

llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll

Lines changed: 92 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
22
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
33
; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
4+
; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
5+
; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1
46

57
target triple = "aarch64-unknown-linux"
68

@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
101103
; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
102104
; CHECK-UF4: middle.block:
103105
;
106+
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
107+
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
108+
; CHECK-TF-NEXT: entry:
109+
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
110+
; CHECK-TF: vector.ph:
111+
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
112+
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
113+
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
114+
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
115+
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
116+
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
117+
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
118+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
119+
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
120+
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
121+
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
122+
; CHECK-TF: vector.body:
123+
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
124+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
125+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
126+
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
127+
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
128+
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
129+
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
130+
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
131+
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
132+
; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
133+
; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
134+
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
135+
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
136+
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
137+
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
138+
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
139+
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
140+
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
141+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
142+
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
143+
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
144+
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
145+
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
146+
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
147+
; CHECK-TF: middle.block:
148+
;
104149
entry:
105150
br label %for.body
106151

@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
222267
; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
223268
; CHECK-UF4: middle.block:
224269
;
270+
; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
271+
; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
272+
; CHECK-TF-NEXT: entry:
273+
; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
274+
; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
275+
; CHECK-TF: for.body.preheader:
276+
; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
277+
; CHECK-TF: vector.ph:
278+
; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
279+
; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
280+
; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
281+
; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
282+
; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
283+
; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
284+
; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
285+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
286+
; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
287+
; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
288+
; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
289+
; CHECK-TF: vector.body:
290+
; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
291+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
292+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
293+
; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
294+
; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
295+
; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
296+
; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
297+
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
298+
; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
299+
; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
300+
; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
301+
; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
302+
; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
303+
; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
304+
; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
305+
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
306+
; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
307+
; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
308+
; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
309+
; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
310+
; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
311+
; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
312+
; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
313+
; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
314+
; CHECK-TF: middle.block:
315+
;
225316
entry:
226317
%cmp6 = icmp sgt i64 %n, 0
227318
br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:
243334

244335
attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
245336

246-
;.
247-
; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
248-
; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
249-
; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
250-
; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
251-
;.
252-
; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
253-
; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
254-
; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
255-
; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
256-
;.

0 commit comments

Comments (0)