Skip to content

Commit 6d234a0

Browse files
committed
[AArch64][SVE] Add SubtargetFeature to disable lowering unpredicated loads/stores as LDR/STR
PR llvm#127837 changed the lowering for unpredicated SVE loads/stores to use LDR/STR instead of LD1/ST1. However, on some CPUs, such as A64FX, LDR/STR and LD1/ST1 do not perform the same, so the lowering introduced in llvm#127837 can cause a performance regression on these targets. This patch adds a SubtargetFeature (`disable-unpredicated-ld-st-lower`) that disables this lowering, and enables it in the A64FX tuning features to prevent the regression there.
1 parent 036279a commit 6d234a0

File tree

5 files changed

+38
-2
lines changed

5 files changed

+38
-2
lines changed

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,10 @@ def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
915915
"UseWzrToVecMove", "true",
916916
"Move from WZR to insert 0 into vector registers">;
917917

918+
def FeatureDisableUnpredicatedLdStLower : SubtargetFeature<
919+
"disable-unpredicated-ld-st-lower", "DisableUnpredicatedLdStLower",
920+
"true", "Disable lowering unpredicated loads/stores as LDR/STR">;
921+
918922
//===----------------------------------------------------------------------===//
919923
// Architectures.
920924
//

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,8 @@ def AllowMisalignedMemAccesses
443443

444444
def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
445445

446+
def AllowUnpredicatedLdStLower
447+
: Predicate<"!Subtarget->disableUnpredicatedLdStLower()">;
446448

447449
//===----------------------------------------------------------------------===//
448450
// AArch64-specific DAG Nodes.

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
306306
FeatureAggressiveFMA,
307307
FeatureArithmeticBccFusion,
308308
FeatureStorePairSuppress,
309-
FeaturePredictableSelectIsExpensive]>;
309+
FeaturePredictableSelectIsExpensive,
310+
FeatureDisableUnpredicatedLdStLower]>;
310311

311312
def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
312313
"Fujitsu FUJITSU-MONAKA processors", [

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3164,7 +3164,7 @@ let Predicates = [HasSVE_or_SME] in {
31643164
}
31653165

31663166
// Allow using LDR/STR to avoid the predicate dependence.
3167-
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
3167+
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses, AllowUnpredicatedLdStLower] in
31683168
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
31693169
let AddedComplexity = 2 in {
31703170
def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+disable-unpredicated-ld-st-lower < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix CHECK-DEFAULT %s
4+
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=a64fx < %s | FileCheck --check-prefix CHECK-A64FX %s
5+
6+
define void @nxv2i64(ptr %ldptr, ptr %stptr) {
7+
; CHECK-LABEL: nxv2i64:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: ptrue p0.d
10+
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
11+
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
12+
; CHECK-NEXT: ret
13+
;
14+
; CHECK-DEFAULT-LABEL: nxv2i64:
15+
; CHECK-DEFAULT: // %bb.0:
16+
; CHECK-DEFAULT-NEXT: ldr z0, [x0]
17+
; CHECK-DEFAULT-NEXT: str z0, [x1]
18+
; CHECK-DEFAULT-NEXT: ret
19+
;
20+
; CHECK-A64FX-LABEL: nxv2i64:
21+
; CHECK-A64FX: // %bb.0:
22+
; CHECK-A64FX-NEXT: ptrue p0.d
23+
; CHECK-A64FX-NEXT: ld1d { z0.d }, p0/z, [x0]
24+
; CHECK-A64FX-NEXT: st1d { z0.d }, p0, [x1]
25+
; CHECK-A64FX-NEXT: ret
26+
%l3 = load <vscale x 2 x i64>, ptr %ldptr, align 8
27+
store <vscale x 2 x i64> %l3, ptr %stptr, align 8
28+
ret void
29+
}

0 commit comments

Comments
 (0)