Skip to content

Commit 9100c92

Browse files
authored
[AArch64] Enable masked load/store for Streaming-SVE with -march=armv8-a+sme (#163133)
For the AArch64 target, isLegalMaskedLoadStore() should not return false when only streaming SVE is available. With -march=armv8-a+sme, workloads containing loops whose predication is data-dependent on array/vector contents now get masked loads/stores, along with the necessary scalable-vectorization constructs. Fixes: #162797
1 parent cc8f7cd commit 9100c92

File tree

2 files changed

+188
-1
lines changed

2 files changed

+188
-1
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
312312
}
313313

314314
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
315-
if (!ST->hasSVE())
315+
if (!ST->isSVEorStreamingSVEAvailable())
316316
return false;
317317

318318
// For fixed vectors, avoid scalarization if using SVE for them.
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

; Regression test: @wombat is a streaming-mode SME function (attribute
; "aarch64_pstate_sm_body" with "+sme" and no "+sve" in #0), so masked
; load/store legality must consider streaming SVE as available.  The
; loop-vectorizer is expected to emit @llvm.masked.load/@llvm.masked.store
; on <vscale x 16 x i8> for the loads/stores below, whose execution is
; data-dependent on the loaded %arg5 values.
define void @wombat(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i8 %arg6) #0 {
; CHECK-LABEL: define void @wombat(
; CHECK-SAME: i32 [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]], ptr [[ARG3:%.*]], ptr [[ARG4:%.*]], ptr [[ARG5:%.*]], i8 [[ARG6:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[ARG]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %[[BB7:.*]], label %[[BB25:.*]]
; CHECK: [[BB7]]:
; CHECK-NEXT: [[ZEXT:%.*]] = zext nneg i32 [[ARG]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[ZEXT]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[ZEXT]]
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[ZEXT]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[ARG5]], i64 [[ZEXT]]
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[ZEXT]]
; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[ZEXT]]
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP1]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[ARG1]], [[SCEVGEP4]]
; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
; CHECK-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[ARG5]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
; CHECK-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
; CHECK-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[ARG3]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
; CHECK-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
; CHECK-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[ARG2]], [[SCEVGEP4]]
; CHECK-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[ARG4]], [[SCEVGEP1]]
; CHECK-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
; CHECK-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX27]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[ZEXT]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[ZEXT]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[ARG6]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[ARG1]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META3:![0-9]+]], !noalias [[META5:![0-9]+]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG3]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD28:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META9:![0-9]+]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[ARG4]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP8]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META10:![0-9]+]]
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD28]]
; CHECK-NEXT: [[TMP10:%.*]] = add <vscale x 16 x i8> [[TMP9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP10]], ptr align 1 [[TMP6]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META3]], !noalias [[META5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[ARG2]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i8> poison), !alias.scope [[META11:![0-9]+]], !noalias [[META12:![0-9]+]]
; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD28]], [[WIDE_MASKED_LOAD28]]
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD30]], [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP11]], <vscale x 16 x i1> [[TMP5]]), !alias.scope [[META11]], !noalias [[META12]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ZEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB24:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[BB7]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[BB8:.*]]
; CHECK: [[BB8]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD22:%.*]], %[[BB21:.*]] ]
; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG5]], i64 [[PHI]]
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GETELEMENTPTR]], align 1
; CHECK-NEXT: [[ICMP9:%.*]] = icmp ult i8 [[LOAD]], [[ARG6]]
; CHECK-NEXT: br i1 [[ICMP9]], label %[[BB21]], label %[[BB10:.*]]
; CHECK: [[BB10]]:
; CHECK-NEXT: [[GETELEMENTPTR11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1]], i64 [[PHI]]
; CHECK-NEXT: [[LOAD12:%.*]] = load i8, ptr [[GETELEMENTPTR11]], align 1
; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG3]], i64 [[PHI]]
; CHECK-NEXT: [[LOAD14:%.*]] = load i8, ptr [[GETELEMENTPTR13]], align 1
; CHECK-NEXT: [[GETELEMENTPTR15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG4]], i64 [[PHI]]
; CHECK-NEXT: [[LOAD16:%.*]] = load i8, ptr [[GETELEMENTPTR15]], align 1
; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[LOAD16]], [[LOAD14]]
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[LOAD12]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[GETELEMENTPTR11]], align 1
; CHECK-NEXT: [[GETELEMENTPTR17:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG2]], i64 [[PHI]]
; CHECK-NEXT: [[LOAD18:%.*]] = load i8, ptr [[GETELEMENTPTR17]], align 1
; CHECK-NEXT: [[MUL19:%.*]] = mul i8 [[LOAD14]], [[LOAD14]]
; CHECK-NEXT: [[ADD20:%.*]] = add i8 [[LOAD18]], [[MUL19]]
; CHECK-NEXT: store i8 [[ADD20]], ptr [[GETELEMENTPTR17]], align 1
; CHECK-NEXT: br label %[[BB21]]
; CHECK: [[BB21]]:
; CHECK-NEXT: [[ADD22]] = add nuw nsw i64 [[PHI]], 1
; CHECK-NEXT: [[ICMP23:%.*]] = icmp eq i64 [[ADD22]], [[ZEXT]]
; CHECK-NEXT: br i1 [[ICMP23]], label %[[BB24]], label %[[BB8]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[BB24]]:
; CHECK-NEXT: br label %[[BB25]]
; CHECK: [[BB25]]:
; CHECK-NEXT: ret void
;
bb:
  %icmp = icmp sgt i32 %arg, 0
  br i1 %icmp, label %bb7, label %bb25

bb7:                                              ; preds = %bb
  %zext = zext nneg i32 %arg to i64
  br label %bb8

; Loop header: load a guard byte from %arg5; the body in %bb10 runs only
; when the loaded value is >= %arg6, making the loads/stores there
; data-dependent predicated — vectorizing them requires masked memory ops.
bb8:                                              ; preds = %bb21, %bb7
  %phi = phi i64 [ 0, %bb7 ], [ %add22, %bb21 ]
  %getelementptr = getelementptr inbounds nuw i8, ptr %arg5, i64 %phi
  %load = load i8, ptr %getelementptr, align 1
  %icmp9 = icmp ult i8 %load, %arg6
  br i1 %icmp9, label %bb21, label %bb10

; Conditionally executed body: arg1[i] += arg4[i] * arg3[i];
; arg2[i] += arg3[i] * arg3[i].
bb10:                                             ; preds = %bb8
  %getelementptr11 = getelementptr inbounds nuw i8, ptr %arg1, i64 %phi
  %load12 = load i8, ptr %getelementptr11, align 1
  %getelementptr13 = getelementptr inbounds nuw i8, ptr %arg3, i64 %phi
  %load14 = load i8, ptr %getelementptr13, align 1
  %getelementptr15 = getelementptr inbounds nuw i8, ptr %arg4, i64 %phi
  %load16 = load i8, ptr %getelementptr15, align 1
  %mul = mul i8 %load16, %load14
  %add = add i8 %mul, %load12
  store i8 %add, ptr %getelementptr11, align 1
  %getelementptr17 = getelementptr inbounds nuw i8, ptr %arg2, i64 %phi
  %load18 = load i8, ptr %getelementptr17, align 1
  %mul19 = mul i8 %load14, %load14
  %add20 = add i8 %load18, %mul19
  store i8 %add20, ptr %getelementptr17, align 1
  br label %bb21

bb21:                                             ; preds = %bb10, %bb8
  %add22 = add nuw nsw i64 %phi, 1
  %icmp23 = icmp eq i64 %add22, %zext
  br i1 %icmp23, label %bb24, label %bb8, !llvm.loop !0

bb24:                                             ; preds = %bb21
  br label %bb25

bb25:                                             ; preds = %bb24, %bb
  ret void
}

; Streaming-SVE configuration under test: SME body function, "+sme" present,
; no "+sve" in the target features.
attributes #0 = { uwtable vscale_range(1,16) "aarch64_pstate_sm_body" "target-features"="+fp-armv8,+neon,+sme,+v8a,-fmv" }

; Loop metadata forcing scalable vectorization at width 16.
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 16}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
;.
; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]}
; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META1]], [[META7:![0-9]+]], [[META8:![0-9]+]]}
; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]}
; CHECK: [[META7]] = distinct !{[[META7]], [[META2]]}
; CHECK: [[META8]] = distinct !{[[META8]], [[META2]]}
; CHECK: [[META9]] = !{[[META7]]}
; CHECK: [[META10]] = !{[[META8]]}
; CHECK: [[META11]] = !{[[META6]]}
; CHECK: [[META12]] = !{[[META1]], [[META7]], [[META8]]}
; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]}
; CHECK: [[META14]] = !{!"llvm.loop.mustprogress"}
; CHECK: [[META15]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META16]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META14]], [[META15]]}
;.

0 commit comments

Comments
 (0)