Commit f831463

[MemoryLocation] Size Scalable Masked MemOps (#154785)
Scalable masked loads and stores with a get.active.lane.mask whose size is less than or equal to the scalable type's minimum number of elements can be proven to have a fixed size. Adding this information allows scalable masked loads and stores to benefit from alias analysis optimizations.
1 parent 05da160 commit f831463
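To illustrate the idea, a minimal IR sketch (constructed for this summary, not taken from the patch; %v and %p stand for an arbitrary vector value and pointer):

  ; The lane mask has constant bounds [0, 4), and 4 is the minimum element
  ; count of <vscale x 4 x float>, so this masked store provably writes
  ; exactly 4 x f32 = 16 bytes regardless of vscale.
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %v, ptr %p, i32 1, <vscale x 4 x i1> %mask)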

2 files changed, +292 -10 lines changed

llvm/lib/Analysis/MemoryLocation.cpp

Lines changed: 44 additions & 10 deletions
@@ -12,6 +12,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include <optional>
 using namespace llvm;
@@ -150,6 +151,33 @@ MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
   return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata());
 }
 
+// If the mask for a memory op is a get.active.lane.mask intrinsic,
+// we can possibly infer the size of the memory written or read.
+static std::optional<FixedVectorType *>
+getKnownTypeFromMaskedOp(Value *Mask, VectorType *Ty) {
+  using namespace llvm::PatternMatch;
+  ConstantInt *Op0, *Op1;
+  if (!match(Mask, m_Intrinsic<Intrinsic::get_active_lane_mask>(
+                       m_ConstantInt(Op0), m_ConstantInt(Op1))))
+    return std::nullopt;
+
+  APInt LaneMaskLo = Op0->getValue();
+  APInt LaneMaskHi = Op1->getValue();
+  if (LaneMaskHi.ule(LaneMaskLo))
+    return std::nullopt;
+
+  APInt NumElts = LaneMaskHi - LaneMaskLo;
+  if (NumElts.ugt(Ty->getElementCount().getKnownMinValue())) {
+    if (isa<ScalableVectorType>(Ty))
+      return std::nullopt;
+    // Unlike scalable vectors, fixed vector types are guaranteed to hold
+    // exactly KnownMinValue elements, so the count can be clamped.
+    NumElts = Ty->getElementCount().getKnownMinValue();
+  }
+
+  return FixedVectorType::get(Ty->getElementType(), NumElts.getZExtValue());
+}
+
 MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
                                               unsigned ArgIdx,
                                               const TargetLibraryInfo *TLI) {
@@ -213,20 +241,26 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call,
             cast<ConstantInt>(II->getArgOperand(0))->getZExtValue()),
         AATags);
 
-  case Intrinsic::masked_load:
+  case Intrinsic::masked_load: {
     assert(ArgIdx == 0 && "Invalid argument index");
-    return MemoryLocation(
-        Arg,
-        LocationSize::upperBound(DL.getTypeStoreSize(II->getType())),
-        AATags);
 
-  case Intrinsic::masked_store:
+    auto *Ty = cast<VectorType>(II->getType());
+    if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(2), Ty))
+      return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
+    return MemoryLocation(
+        Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+  }
+  case Intrinsic::masked_store: {
     assert(ArgIdx == 1 && "Invalid argument index");
+
+    auto *Ty = cast<VectorType>(II->getArgOperand(0)->getType());
+    if (auto KnownType = getKnownTypeFromMaskedOp(II->getOperand(3), Ty))
+      return MemoryLocation(Arg, DL.getTypeStoreSize(*KnownType), AATags);
+
     return MemoryLocation(
-        Arg,
-        LocationSize::upperBound(
-            DL.getTypeStoreSize(II->getArgOperand(0)->getType())),
-        AATags);
+        Arg, LocationSize::upperBound(DL.getTypeStoreSize(Ty)), AATags);
+  }
 
   case Intrinsic::invariant_end:
     // The first argument to an invariant.end is a "descriptor" type (e.g. a
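As a worked sketch of the new helper's arithmetic on an operand of type <vscale x 4 x float>, whose known minimum element count is 4 (example mask values chosen for this note, not part of the commit):

  ; get.active.lane.mask(0, 4): NumElts = 4 - 0 = 4 <= 4
  ;   -> known type <4 x float>, precise size 16 bytes
  ; get.active.lane.mask(1, 4): NumElts = 3
  ;   -> known type <3 x float>, precise size 12 bytes
  ; get.active.lane.mask(0, 8): NumElts = 8 > 4 on a scalable type
  ;   -> std::nullopt; the mask covers 4 or 8 lanes depending on vscale,
  ;      so the conservative LocationSize::upperBound is kept
  ; get.active.lane.mask(4, 2): upper bound <= lower bound -> std::nullopt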
Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,248 @@
; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -S | FileCheck %s

define <vscale x 4 x float> @dead_scalable_store(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store(
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
;
  %arr = alloca [64 x i32], align 4
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)

  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %gep.0.32 = getelementptr inbounds nuw i8, ptr %0, i64 32
  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
  %gep.arr.32 = getelementptr inbounds nuw i8, ptr %arr, i64 32
  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48

  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)

  %load.0.32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.32, ptr nonnull %gep.arr.32, i32 1, <vscale x 4 x i1> %mask)

  %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)

  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1

  ret <vscale x 4 x float> %fadd
}
define <4 x float> @dead_scalable_store_fixed(ptr %0) {
; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed(
; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)
; CHECK-NOT: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)
; CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)
;
  %arr = alloca [64 x i32], align 4
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
  %mask2 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 3)

  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %gep.0.36 = getelementptr inbounds nuw i8, ptr %0, i64 36
  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
  %gep.arr.36 = getelementptr inbounds nuw i8, ptr %arr, i64 36
  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48

  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask)

  %load.0.36 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.36, i32 1, <4 x i1> %mask2, <4 x float> zeroinitializer)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.36, ptr nonnull %gep.arr.36, i32 1, <4 x i1> %mask2)

  %load.0.48 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.0.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask)

  %faddop0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  %faddop1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %gep.arr.48, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  %fadd = fadd <4 x float> %faddop0, %faddop1

  ret <4 x float> %fadd
}
define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @scalable_store_partial_overwrite(
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)
;
  %arr = alloca [64 x i32], align 4
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)

  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
  %gep.0.48 = getelementptr inbounds nuw i8, ptr %0, i64 48
  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
  %gep.arr.48 = getelementptr inbounds nuw i8, ptr %arr, i64 48

  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)

  %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)

  %load.0.48 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.48, ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask)

  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.48, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1

  ret <vscale x 4 x float> %fadd
}
define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalable_store_small_mask(
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)
; CHECK-NOT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)
;
  %arr = alloca [64 x i32], align 4
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)

  %gep.0.16 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %gep.0.30 = getelementptr inbounds nuw i8, ptr %0, i64 30
  %gep.0.46 = getelementptr inbounds nuw i8, ptr %0, i64 46
  %gep.arr.16 = getelementptr inbounds nuw i8, ptr %arr, i64 16
  %gep.arr.30 = getelementptr inbounds nuw i8, ptr %arr, i64 30
  %gep.arr.46 = getelementptr inbounds nuw i8, ptr %arr, i64 46

  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.16, ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %mask)

  %load.0.30 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.30, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.30, ptr nonnull %gep.arr.30, i32 1, <vscale x 4 x i1> %mask)

  %load.0.46 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.0.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0.46, ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask)

  %smallmask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 2)
  %faddop0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.16, i32 1, <vscale x 4 x i1> %smallmask, <vscale x 4 x float> zeroinitializer)
  %faddop1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %gep.arr.46, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %fadd = fadd <vscale x 4 x float> %faddop0, %faddop1

  ret <vscale x 4 x float> %fadd
}
define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store(
; CHECK-NOT: store i32 20, ptr %gep.1.12
;
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4)
  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
  store i32 20, ptr %gep.1.12

  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  ret <vscale x 4 x float> %retval
}
; CHECK-LABEL: define <4 x float> @dead_scalable_store_fixed_large_mask(
; CHECK-NOT: store i32 20, ptr %1
; CHECK: store i32 50, ptr %gep.5
define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) {
  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7)
  store i32 20, ptr %1

  %gep.5 = getelementptr inbounds nuw i32, ptr %1, i64 5
  store i32 50, ptr %gep.5

  %load.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %load.0, ptr nonnull %1, i32 1, <4 x i1> %mask)
  %retval = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %retval
}
; We don't know if the scalar stores are dead as we can't determine vscale:
; this get.active.lane.mask may cover 4 or 8 i32 elements.
define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @mask_gt_minimum_num_elts(
; CHECK: store i32 10, ptr %gep.1.12
; CHECK: store i32 20, ptr %gep.1.28
;
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
  store i32 10, ptr %gep.1.12
  %gep.1.28 = getelementptr inbounds nuw i8, ptr %1, i64 28
  store i32 20, ptr %gep.1.28

  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  ret <vscale x 4 x float> %retval
}
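Spelling out the two possibilities for this test (an illustrative note, not part of the committed file):

; vscale = 1: the <vscale x 4 x i1> mask has 4 lanes, so lane mask (0, 8)
;             covers bytes [0, 16) of %1 and the store at offset 28 survives.
; vscale = 2: the mask has 8 lanes and covers bytes [0, 32), overwriting
;             both scalar stores.
; Since vscale is unknown at compile time, neither store can be proven dead.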
; Don't do anything if the mask's Op1 < Op0.
define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_lt(
; CHECK: store i32 20, ptr %1
;
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2)
  store i32 20, ptr %1

  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  ret <vscale x 4 x float> %retval
}
; Don't do anything if the mask's Op1 == Op0.
define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @active_lane_mask_eq(
; CHECK: store i32 20, ptr %1
;
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2)
  store i32 20, ptr %1

  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  ret <vscale x 4 x float> %retval
}
define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 16 x i8> @scalar_stores_small_mask(
; CHECK-NOT: store i8 60, ptr %gep.1.6
; CHECK: store i8 120, ptr %gep.1.8
;
  %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 0, i8 7)
  %gep.1.6 = getelementptr inbounds nuw i8, ptr %1, i64 6
  store i8 60, ptr %gep.1.6
  %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
  store i8 120, ptr %gep.1.8

  %load.0 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull %0, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
  call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %load.0, ptr %1, i32 1, <vscale x 16 x i1> %mask)
  %retval = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %1, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> zeroinitializer)
  ret <vscale x 16 x i8> %retval
}
define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) {
; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store_offset(
; CHECK-NOT: store i32 10, ptr %gep.1.0
; CHECK-NOT: store i32 20, ptr %gep.1.4
; CHECK-NOT: store i32 30, ptr %gep.1.8
; CHECK: store i32 40, ptr %gep.1.12
;
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4)
  %gep.1.0 = getelementptr inbounds nuw i8, ptr %1, i64 0
  store i32 10, ptr %gep.1.0
  %gep.1.4 = getelementptr inbounds nuw i8, ptr %1, i64 4
  store i32 20, ptr %gep.1.4
  %gep.1.8 = getelementptr inbounds nuw i8, ptr %1, i64 8
  store i32 30, ptr %gep.1.8
  %gep.1.12 = getelementptr inbounds nuw i8, ptr %1, i64 12
  store i32 40, ptr %gep.1.12

  %load.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load.0, ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask)
  %retval = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  ret <vscale x 4 x float> %retval
}
