
Commit 7464b86

[AArch64][SVE] Add SubtargetFeature to disable lowering unpredicated loads/stores as LDR/STR (llvm#170256)
PR llvm#127837 changed the lowering for unpredicated loads/stores to use LDR/STR instead of LD1/ST1. However, on some CPUs, such as A64FX, there is a performance difference between LD1/ST1 and LDR/STR. As a result, the lowering introduced in llvm#127837 can cause a performance regression on these targets. This patch adds a SubtargetFeature `disable-unpredicated-ld-st-lower` to disable this lowering. It is enabled for the A64FX target.
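To illustrate the effect, here is a minimal sketch distilled from the ld1b_inbound case in the updated test below (the RUN options mirror the test's RUN lines; nothing here is a new interface). The same unpredicated SVE load selects differently depending on the feature:

; Default (-mattr=+sve): the load lowers to an unpredicated LDR.
; With +disable-unpredicated-ld-st-lower (or -mcpu=a64fx): it stays a predicated
; LD1 behind a ptrue, avoiding the LDR/STR form that regresses on A64FX.
define <vscale x 16 x i8> @ld1b_inbound(ptr %a) {
  ; default:          ldr z0, [x0, #2, mul vl]
  ; feature enabled:  ptrue p0.b
  ;                   ld1b { z0.b }, p0/z, [x0, #2, mul vl]
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 2
  %load = load <vscale x 16 x i8>, ptr %base
  ret <vscale x 16 x i8> %load
}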
1 parent: aebab05

6 files changed, 211 insertions(+), 2 deletions(-)

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 4 additions & 0 deletions
@@ -915,6 +915,10 @@ def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
                                        "UseWzrToVecMove", "true",
                                        "Move from WZR to insert 0 into vector registers">;
 
+def FeatureDisableUnpredicatedLdStLower : SubtargetFeature<
+    "disable-unpredicated-ld-st-lower", "DisableUnpredicatedLdStLower",
+    "true", "Disable lowering unpredicated loads/stores as LDR/STR">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 2 additions & 0 deletions
@@ -443,6 +443,8 @@ def AllowMisalignedMemAccesses
 
 def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
 
+def AllowUnpredicatedLdStLower
+    : Predicate<"!Subtarget->disableUnpredicatedLdStLower()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 2 additions & 1 deletion
@@ -306,7 +306,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
                                     FeatureAggressiveFMA,
                                     FeatureArithmeticBccFusion,
                                     FeatureStorePairSuppress,
-                                    FeaturePredictableSelectIsExpensive]>;
+                                    FeaturePredictableSelectIsExpensive,
+                                    FeatureDisableUnpredicatedLdStLower]>;
 
 def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
                                   "Fujitsu FUJITSU-MONAKA processors", [

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 1 addition & 1 deletion
@@ -3165,7 +3165,7 @@ let Predicates = [HasSVE_or_SME] in {
 }
 
 // Allow using LDR/STR to avoid the predicate dependence.
-let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
+let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses, AllowUnpredicatedLdStLower] in
 foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
   let AddedComplexity = 2 in {
     def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),

llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll

Lines changed: 100 additions & 0 deletions
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+disable-unpredicated-ld-st-lower < %s | FileCheck --check-prefixes=COMMON-NO-UPLS-LOWER,NO-UPLS-LOWER %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=a64fx < %s | FileCheck --check-prefixes=COMMON-NO-UPLS-LOWER,A64FX %s
 
 ; LD1B
 
@@ -8,6 +10,12 @@ define <vscale x 16 x i8> @ld1b_lower_bound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #-8, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1b_lower_bound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.b
+; COMMON-NO-UPLS-LOWER-NEXT: ld1b { z0.b }, p0/z, [x0, #-8, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 -8
   %load = load <vscale x 16 x i8>, ptr %base
   ret <vscale x 16 x i8> %load
@@ -18,6 +26,12 @@ define <vscale x 16 x i8> @ld1b_inbound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #2, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1b_inbound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.b
+; COMMON-NO-UPLS-LOWER-NEXT: ld1b { z0.b }, p0/z, [x0, #2, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 2
   %load = load <vscale x 16 x i8>, ptr %base
   ret <vscale x 16 x i8> %load
@@ -28,6 +42,12 @@ define <vscale x 16 x i8> @ld1b_upper_bound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #7, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1b_upper_bound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.b
+; COMMON-NO-UPLS-LOWER-NEXT: ld1b { z0.b }, p0/z, [x0, #7, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 7
   %load = load <vscale x 16 x i8>, ptr %base
   ret <vscale x 16 x i8> %load
@@ -38,6 +58,13 @@ define <vscale x 16 x i8> @ld1b_out_of_upper_bound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #8, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1b_out_of_upper_bound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.b
+; COMMON-NO-UPLS-LOWER-NEXT: rdvl x8, #8
+; COMMON-NO-UPLS-LOWER-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 8
   %load = load <vscale x 16 x i8>, ptr %base
   ret <vscale x 16 x i8> %load
@@ -48,6 +75,13 @@ define <vscale x 16 x i8> @ld1b_out_of_lower_bound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #-9, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1b_out_of_lower_bound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.b
+; COMMON-NO-UPLS-LOWER-NEXT: rdvl x8, #-9
+; COMMON-NO-UPLS-LOWER-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 -9
   %load = load <vscale x 16 x i8>, ptr %base
   ret <vscale x 16 x i8> %load
@@ -60,6 +94,12 @@ define <vscale x 8 x i16> @ld1h_inbound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #-2, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1h_inbound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.h
+; COMMON-NO-UPLS-LOWER-NEXT: ld1h { z0.h }, p0/z, [x0, #-2, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 8 x i16>, ptr %a, i64 -2
   %load = load <vscale x 8 x i16>, ptr %base
   ret <vscale x 8 x i16> %load
@@ -72,6 +112,12 @@ define <vscale x 4 x i32> @ld1s_inbound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #4, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1s_inbound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.s
+; COMMON-NO-UPLS-LOWER-NEXT: ld1w { z0.s }, p0/z, [x0, #4, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 4 x i32>, ptr %a, i64 4
   %load = load <vscale x 4 x i32>, ptr %base
   ret <vscale x 4 x i32> %load
@@ -84,6 +130,12 @@ define <vscale x 2 x i64> @ld1d_inbound(ptr %a) {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr z0, [x0, #6, mul vl]
 ; CHECK-NEXT: ret
+;
+; COMMON-NO-UPLS-LOWER-LABEL: ld1d_inbound:
+; COMMON-NO-UPLS-LOWER: // %bb.0:
+; COMMON-NO-UPLS-LOWER-NEXT: ptrue p0.d
+; COMMON-NO-UPLS-LOWER-NEXT: ld1d { z0.d }, p0/z, [x0, #6, mul vl]
+; COMMON-NO-UPLS-LOWER-NEXT: ret
   %base = getelementptr <vscale x 2 x i64>, ptr %a, i64 6
   %load = load <vscale x 2 x i64>, ptr %base
   ret <vscale x 2 x i64> %load
@@ -97,6 +149,22 @@ define void @load_nxv6f16(ptr %a) {
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
 ; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; CHECK-NEXT: ret
+;
+; NO-UPLS-LOWER-LABEL: load_nxv6f16:
+; NO-UPLS-LOWER: // %bb.0:
+; NO-UPLS-LOWER-NEXT: ptrue p0.d
+; NO-UPLS-LOWER-NEXT: ptrue p1.s
+; NO-UPLS-LOWER-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; NO-UPLS-LOWER-NEXT: ld1h { z0.s }, p1/z, [x0]
+; NO-UPLS-LOWER-NEXT: ret
+;
+; A64FX-LABEL: load_nxv6f16:
+; A64FX: // %bb.0:
+; A64FX-NEXT: ptrue p0.d
+; A64FX-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; A64FX-NEXT: ptrue p0.s
+; A64FX-NEXT: ld1h { z0.s }, p0/z, [x0]
+; A64FX-NEXT: ret
   %val = load volatile <vscale x 6 x half>, ptr %a
   ret void
 }
@@ -108,6 +176,22 @@ define void @load_nxv6f32(ptr %a) {
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #2, mul vl]
 ; CHECK-NEXT: ldr z0, [x0]
 ; CHECK-NEXT: ret
+;
+; NO-UPLS-LOWER-LABEL: load_nxv6f32:
+; NO-UPLS-LOWER: // %bb.0:
+; NO-UPLS-LOWER-NEXT: ptrue p0.d
+; NO-UPLS-LOWER-NEXT: ptrue p1.s
+; NO-UPLS-LOWER-NEXT: ld1w { z0.d }, p0/z, [x0, #2, mul vl]
+; NO-UPLS-LOWER-NEXT: ld1w { z0.s }, p1/z, [x0]
+; NO-UPLS-LOWER-NEXT: ret
+;
+; A64FX-LABEL: load_nxv6f32:
+; A64FX: // %bb.0:
+; A64FX-NEXT: ptrue p0.d
+; A64FX-NEXT: ld1w { z0.d }, p0/z, [x0, #2, mul vl]
+; A64FX-NEXT: ptrue p0.s
+; A64FX-NEXT: ld1w { z0.s }, p0/z, [x0]
+; A64FX-NEXT: ret
   %val = load volatile <vscale x 6 x float>, ptr %a
   ret void
 }
@@ -119,6 +203,22 @@ define void @load_nxv12f16(ptr %a) {
 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #2, mul vl]
 ; CHECK-NEXT: ldr z0, [x0]
 ; CHECK-NEXT: ret
+;
+; NO-UPLS-LOWER-LABEL: load_nxv12f16:
+; NO-UPLS-LOWER: // %bb.0:
+; NO-UPLS-LOWER-NEXT: ptrue p0.s
+; NO-UPLS-LOWER-NEXT: ptrue p1.h
+; NO-UPLS-LOWER-NEXT: ld1h { z0.s }, p0/z, [x0, #2, mul vl]
+; NO-UPLS-LOWER-NEXT: ld1h { z0.h }, p1/z, [x0]
+; NO-UPLS-LOWER-NEXT: ret
+;
+; A64FX-LABEL: load_nxv12f16:
+; A64FX: // %bb.0:
+; A64FX-NEXT: ptrue p0.s
+; A64FX-NEXT: ld1h { z0.s }, p0/z, [x0, #2, mul vl]
+; A64FX-NEXT: ptrue p0.h
+; A64FX-NEXT: ld1h { z0.h }, p0/z, [x0]
+; A64FX-NEXT: ret
   %val = load volatile <vscale x 12 x half>, ptr %a
   ret void
 }
