Commit 11ac166 (parent 82ef4ee)
[GVN] Teach GVN simple masked load/store forwarding
This patch teaches GVN to eliminate redundant masked loads, either by forwarding a previously loaded value directly or by forwarding the stored value through a select. This is possible when the same mask is used by masked stores and masked loads that access the same memory location.
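In IR terms, the direct-forwarding case collapses a masked load that re-reads memory a masked store just wrote. A minimal sketch, assuming a hypothetical %src/%dst pair and a shared %mask (names are illustrative, not taken from this commit's tests):

; Before: %reload re-reads what the store wrote; both loads use the same
; mask and the same zeroinitializer passthrough, so every lane is known.
%v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)
call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %dst, i32 4, <4 x i1> %mask)
%reload = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %dst, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)

; After: GVN replaces all uses of %reload with %v and removes the load.
%v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %src, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)
call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %dst, i32 4, <4 x i1> %mask)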
3 files changed: 210 additions, 0 deletions
llvm/include/llvm/Transforms/Scalar/GVN.h

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,7 @@ class OptimizationRemarkEmitter;
 class PHINode;
 class TargetLibraryInfo;
 class Value;
+class IntrinsicInst;
 /// A private "module" namespace for types and utilities used by GVN. These
 /// are implementation details and should not be used by clients.
 namespace LLVM_LIBRARY_VISIBILITY_NAMESPACE gvn {
@@ -349,6 +350,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
 
   // Helper functions of redundant load elimination.
   bool processLoad(LoadInst *L);
+  bool processMaskedLoad(IntrinsicInst *I);
   bool processNonLocalLoad(LoadInst *L);
   bool processAssumeIntrinsic(AssumeInst *II);

llvm/lib/Transforms/Scalar/GVN.cpp

Lines changed: 50 additions & 0 deletions
@@ -50,6 +50,7 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2287,6 +2288,50 @@ bool GVNPass::processLoad(LoadInst *L) {
   return true;
 }
 
+// Attempt to process masked loads that load from masked stores with the
+// same mask.
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+  Value *Mask = I->getOperand(2);
+  Value *Passthrough = I->getOperand(3);
+
+  MemDepResult Dep = MD->getDependency(I);
+  Instruction *DepInst = Dep.getInst();
+  if (!DepInst || !Dep.isLocal())
+    return false;
+
+  auto *MaskedStore = dyn_cast<IntrinsicInst>(DepInst);
+  if (!MaskedStore || MaskedStore->getIntrinsicID() != Intrinsic::masked_store)
+    return false;
+
+  Value *StoreMask = MaskedStore->getOperand(3);
+  if (StoreMask != Mask)
+    return false;
+
+  Value *OpToForward =
+      AvailableValue::get(MaskedStore->getOperand(0)).getSimpleValue();
+  if (auto *LoadToForward = dyn_cast<IntrinsicInst>(OpToForward);
+      LoadToForward &&
+      LoadToForward->getIntrinsicID() == Intrinsic::masked_load) {
+    // For MaskedLoad->MaskedStore->MaskedLoad, the mask must be the same for
+    // all three instructions, and the passthrough on the two loads must also
+    // be the same.
+    if (LoadToForward->getOperand(2) != Mask ||
+        LoadToForward->getOperand(3) != Passthrough)
+      return false;
+  } else {
+    // MaskedStore(Op, ptr, mask)->MaskedLoad(ptr, mask, passthrough) can be
+    // replaced with MaskedStore(Op, ptr, mask)->select(mask, Op, passthrough).
+    IRBuilder<> Builder(I);
+    OpToForward = Builder.CreateSelect(StoreMask, OpToForward, Passthrough);
+  }
+
+  I->replaceAllUsesWith(OpToForward);
+  ICF->removeUsersOf(I);
+  salvageAndRemoveInstruction(I);
+  ++NumGVNLoad;
+  return true;
+}
+
 /// Return a pair the first field showing the value number of \p Exp and the
 /// second field showing whether it is a value number newly created.
 std::pair<uint32_t, bool>
@@ -2734,6 +2779,11 @@ bool GVNPass::processInstruction(Instruction *I) {
     return false;
   }
 
+  if (auto *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::masked_load)
+      if (processMaskedLoad(II))
+        return true;
+
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
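When the stored value is not itself a masked load with a matching mask and passthrough, the eliminated load must still yield its passthrough in the disabled lanes, which is why processMaskedLoad materializes a select over the store mask instead of forwarding the value directly. A minimal sketch of that rewrite with illustrative names (the forward_binop_with_sel test below exercises the same shape):

; Before: %x is an arbitrary stored vector, so the disabled lanes of %r
; must come from %passthrough rather than from %x.
call void @llvm.masked.store.v4f32.p0(<4 x float> %x, ptr %p, i32 4, <4 x i1> %mask)
%r = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x float> %passthrough)

; After: the load becomes a select that takes %x in the enabled lanes and
; %passthrough in the disabled ones; the store itself stays in place.
call void @llvm.masked.store.v4f32.p0(<4 x float> %x, ptr %p, i32 4, <4 x i1> %mask)
%r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %passthrough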

llvm/test/Transforms/GVN/masked-load-store.ll

Lines changed: 158 additions & 0 deletions
@@ -36,6 +36,164 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
   ret <128 x i8> %v4
 }
 
+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %6 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %7 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %7, ptr %1, i32 1, <4 x i1> %6)
+  %8 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %6, <4 x float> zeroinitializer)
+  ret <4 x float> %8
+}
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[FMUL]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP4]]
+;
+  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @bail_on_different_passthrough(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
+  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %8
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
+;
+  %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3
+
 declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
 declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)