release/21.x: [AArch64] Replace expensive move from wzr by two moves via floating point immediate (#146538) #150001
base: release/21.x
Conversation
[AArch64] Replace expensive move from wzr by two moves via floating point immediate (llvm#146538)

We've noticed that inserting 0 into a known vector lane is implemented via a move from wzr, i.e., moving between register banks. We think it will be cheaper (and have seen improvements on our benchmarks) to materialize 0 into a floating point register and insert from there.

PR: llvm#146538

(cherry picked from commit e333d60)
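Concretely, the updated tests further down show the two lowerings side by side. As a sketch assembled from the CHECK lines in this PR, zeroing lane 3 of a <4 x float> changes as follows:

; before: one instruction, but a GPR-to-vector (cross-register-bank) move
mov.s v0[3], wzr

; after: two instructions, both staying on the FP/SIMD side
movi d1, #0000000000000000
mov.s v0[3], v1[0]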
@fhahn What do you think about merging this PR to the release branch?
@llvm/pr-subscribers-backend-aarch64

Author: None (llvmbot)

Changes

Backport e333d60

Requested by: @juliannagele

Full diff: https://github.com/llvm/llvm-project/pull/150001.diff

5 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64Features.td
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td
- (modified) llvm/lib/Target/AArch64/AArch64Processors.td
- (modified) llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 9973df865ea17..c1c1f0a1024d0 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
"HasDisableFastIncVL", "true",
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
+// On most processors we want to avoid moving from WZR to vector registers
+// (relying on materializing 0 to a FPR and moving from there instead),
+// but on some (in-order) cores it's preferable to avoid the extra instruction instead.
+def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
+ "UseWzrToVecMove", "true",
+ "Move from WZR to insert 0 into vector registers">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9a..5ad8fdb07e56f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
+def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -7376,6 +7378,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(i64 0)),
dsub)>;
+let Predicates = [UseWzrToVecMove] in {
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
@@ -7386,6 +7389,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm))
(EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
(INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>;
+}
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
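The patterns above are now gated on the UseWzrToVecMove predicate, so by default the zero is materialized in an FP register. For targets or experiments that want the old single-instruction form, the subtarget feature can be toggled explicitly; a minimal sketch, assuming a plain llc invocation (the feature string comes from the SubtargetFeature definition above; the input file name is hypothetical):

llc -mtriple=aarch64 -mattr=+use-wzr-to-vec-move vector-insert.ll -o -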
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 5379305bc7a7f..adc984ad795af 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
"Cortex-A320 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureBalanceFPOps,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
"Cortex-A55 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
- FeatureFuseAddress]>;
+ FeatureFuseAddress,
+ FeatureUseWzrToVecMove]>;
def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
"Cortex-A510 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove
]>;
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
"Cortex-A520 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
"Cortex-A520AE ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeaturePostRAScheduler]>;
+ FeaturePostRAScheduler,
+ FeatureUseWzrToVecMove]>;
def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", [
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index ff28c7817d143..bae254bbd2104 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) {
; CHECK-LABEL: test_insert_v8f16_insert_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: dup.8h v0, v0[0]
-; CHECK-NEXT: mov.h v0[7], wzr
+; CHECK-NEXT: mov.h v0[7], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> <half undef, half undef, half undef, half undef, half undef, half undef, half undef, half 0.0>, half %a, i32 0
%v.1 = insertelement <8 x half> %v.0, half %a, i32 1
@@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: dup.4s v0, v0[0]
-; CHECK-NEXT: mov.s v0[3], wzr
+; CHECK-NEXT: mov.s v0[3], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %a, i32 0
%v.1 = insertelement <4 x float> %v.0, float %a, i32 1
@@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) {
ret <8 x i16> %v.0
}
-; TODO: This should jsut be a mov.s v0[3], wzr
define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
; CHECK-LABEL: test_insert_v4f16_f16_zero:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov.h v0[0], wzr
+; CHECK-NEXT: mov.h v0[0], v1[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
@@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
; CHECK-LABEL: test_insert_v8f16_f16_zero:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov.h v0[6], wzr
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov.h v0[6], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
ret <8 x half> %v.0
@@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
; CHECK-LABEL: test_insert_v2f32_f32_zero:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov.s v0[0], wzr
+; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
@@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
; CHECK-LABEL: test_insert_v4f32_f32_zero:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov.s v0[3], wzr
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov.s v0[3], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
ret <4 x float> %v.0
@@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) {
; CHECK-LABEL: test_insert_v2f64_f64_zero:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: ret
+ %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
+ ret <2 x double> %v.0
+}
+
+define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 {
+; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov.h v0[0], wzr
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
+ ret <4 x half> %v.0
+}
+
+define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 {
+; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov.h v0[6], wzr
+; CHECK-NEXT: ret
+ %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
+ ret <8 x half> %v.0
+}
+
+define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 {
+; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov.s v0[0], wzr
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
+ %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
+ ret <2 x float> %v.0
+}
+
+define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 {
+; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov.s v0[3], wzr
+; CHECK-NEXT: ret
+ %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
+ ret <4 x float> %v.0
+}
+
+define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 {
+; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr:
+; CHECK: // %bb.0:
; CHECK-NEXT: mov.d v0[1], xzr
; CHECK-NEXT: ret
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
ret <2 x double> %v.0
}
+
+attributes #1 = {"tune-cpu"="cortex-a55"}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 8a84d3ca2328c..59dfcf9850a49 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-SD-FP16-LABEL: add_v3HalfH:
; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000
; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr
+; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0]
; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
; CHECK-SD-FP16-NEXT: ret
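The _wzr/_xzr tests added above exercise the opt-out path via per-function tuning: the "tune-cpu"="cortex-a55" attribute selects TuneA55, which now carries FeatureUseWzrToVecMove. A minimal standalone sketch of the same mechanism (the function name and constant are illustrative; the attribute string is taken from the test diff):

define <4 x float> @zero_lane_tuned(<4 x float> %a) #1 {
  ; tuned for cortex-a55, so this keeps the single GPR move: mov.s v0[3], wzr
  %v = insertelement <4 x float> %a, float 0.000000e+00, i32 3
  ret <4 x float> %v
}

attributes #1 = { "tune-cpu"="cortex-a55" }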
LGTM. This is a small codegen improvement that should be low risk to pick, if we are still happy to pick things like this.