Commit a9f4ddc

[ExpandVectorPredication] Expand vp.load.ff.
Use a masked load with only the first element possibly active. Return 1 for the number of elements processed, since the EVL has already been converted into the mask by this point in the expansion.
1 parent 58c41b7 commit a9f4ddc
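For reference, the expansion for a fixed-width <2 x i64> call with a variable EVL looks roughly like the sketch below. It mirrors the first CHECK block in the updated test file; the value names are illustrative rather than the ones the pass actually emits.

; Before: fault-only-first VP load.
%load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)

; After: the EVL is folded into the mask, the mask is narrowed to lane 0, and the
; { vector, i32 } result is rebuilt with a constant EVL of 1.
%evl.insert = insertelement <2 x i32> poison, i32 %evl, i64 0
%evl.splat = shufflevector <2 x i32> %evl.insert, <2 x i32> poison, <2 x i32> zeroinitializer
%evl.mask = icmp ult <2 x i32> <i32 0, i32 1>, %evl.splat
%mask = and <2 x i1> %evl.mask, %m
%first.lane = and <2 x i1> %mask, <i1 true, i1 false>
%data = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %ptr, i32 1, <2 x i1> %first.lane, <2 x i64> poison)
%res.vec = insertvalue { <2 x i64>, i32 } poison, <2 x i64> %data, 0
%res = insertvalue { <2 x i64>, i32 } %res.vec, i32 1, 1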

2 files changed: +134 -0 lines

llvm/lib/CodeGen/ExpandVectorPredication.cpp

Lines changed: 16 additions & 0 deletions
@@ -465,6 +465,21 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                          VPI.getName());
     break;
   }
+  case Intrinsic::vp_load_ff: {
+    // Mask out all but the first lane.
+    Value *NewMask = ConstantInt::getFalse(MaskParam->getType());
+    NewMask = Builder.CreateInsertElement(
+        NewMask, ConstantInt::getTrue(MaskParam->getType()->getScalarType()),
+        (uint64_t)0);
+    NewMask = Builder.CreateAnd(MaskParam, NewMask);
+    Value *MaskedLoad = Builder.CreateMaskedLoad(
+        VPI.getType()->subtypes()[0], PtrParam, AlignOpt.valueOrOne(), NewMask);
+    Value *EVLResult = Builder.getInt32(1);
+    Value *InsertValue = Builder.CreateInsertValue(
+        PoisonValue::get(VPI.getType()), MaskedLoad, 0);
+    NewMemoryInst = Builder.CreateInsertValue(InsertValue, EVLResult, 1);
+    break;
+  }
   }
 
   assert(NewMemoryInst);
@@ -609,6 +624,7 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   case Intrinsic::vp_store:
   case Intrinsic::vp_gather:
   case Intrinsic::vp_scatter:
+  case Intrinsic::vp_load_ff:
     return expandPredicationInMemoryIntrinsic(Builder, VPI);
   }

llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll

Lines changed: 118 additions & 0 deletions
@@ -91,6 +91,61 @@ define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, ptr %ptr) {
   ret void
 }
 
+define { <2 x i64>, i32 } @vpload_ff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64(
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT: ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_vlmax(ptr %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_v2i64_vlmax(
+; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i1> [[M:%.*]], <i1 true, i1 false>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP1]], <2 x i64> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT: ret { <2 x i64>, i32 } [[TMP4]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask(
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT: ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask_vlmax(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask_vlmax(
+; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT: ret { <2 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
 ; Scalable vectors
 define <vscale x 1 x i64> @vpload_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: @vpload_nxv1i64(
@@ -196,6 +251,69 @@ define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, ptr %p
   ret void
 }
 
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT: ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_vscale(ptr %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_vscale(
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = and <vscale x 1 x i1> [[M:%.*]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP4]], <vscale x 1 x i64> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP2]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT: ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT: ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask_vscale(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0), <vscale x 1 x i64> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP3]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT: ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
 declare i32 @llvm.vscale.i32()
 
 declare <2 x i64> @llvm.vp.load.v2i64.p0(ptr, <2 x i1>, i32)
