Skip to content

[AArch64] Replace expensive move from wzr by two moves via floating point immediate #146538

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Features.td
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
"HasDisableFastIncVL", "true",
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;

// On most processors we want to avoid moving from WZR to vector registers
// (materializing 0 in an FPR and moving from there instead), but on some
// (in-order) cores it is preferable to avoid the extra instruction.
def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
"UseWzrToVecMove", "true",
"Move from WZR to insert 0 into vector registers">;

//===----------------------------------------------------------------------===//
// Architectures.
//
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",

def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;

// True when the subtarget prefers inserting a floating-point zero into a
// vector lane by moving directly from WZR/XZR.
def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;


//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
Expand Down Expand Up @@ -7377,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
(i64 0)),
dsub)>;

let Predicates = [UseWzrToVecMove] in {
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
Expand All @@ -7387,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm))
(EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
// Insert a floating-point zero into a lane of a v2f64 by moving from XZR.
// The lane index is matched as VectorIndexD (64-bit lanes), so the result
// pattern names the operand with the same class.
def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
(INSvi64gpr V128:$Rn, VectorIndexD:$imm, XZR)>;
}

def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
Expand Down
18 changes: 12 additions & 6 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
"Cortex-A320 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler]>;
FeaturePostRAScheduler,
FeatureUseWzrToVecMove]>;

def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureBalanceFPOps,
FeaturePostRAScheduler]>;
FeaturePostRAScheduler,
FeatureUseWzrToVecMove]>;

def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
"Cortex-A55 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeatureFuseAddress]>;
FeatureFuseAddress,
FeatureUseWzrToVecMove]>;

def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
"Cortex-A510 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler
FeaturePostRAScheduler,
FeatureUseWzrToVecMove
]>;

def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
"Cortex-A520 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler]>;
FeaturePostRAScheduler,
FeatureUseWzrToVecMove]>;

def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
"Cortex-A520AE ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeaturePostRAScheduler]>;
FeaturePostRAScheduler,
FeatureUseWzrToVecMove]>;

def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors", [
Expand Down
71 changes: 64 additions & 7 deletions llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) {
; CHECK-LABEL: test_insert_v8f16_insert_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: dup.8h v0, v0[0]
; CHECK-NEXT: mov.h v0[7], wzr
; CHECK-NEXT: mov.h v0[7], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> <half undef, half undef, half undef, half undef, half undef, half undef, half undef, half 0.0>, half %a, i32 0
%v.1 = insertelement <8 x half> %v.0, half %a, i32 1
Expand Down Expand Up @@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: mov.s v0[3], wzr
; CHECK-NEXT: mov.s v0[3], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %a, i32 0
%v.1 = insertelement <4 x float> %v.0, float %a, i32 1
Expand Down Expand Up @@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) {
ret <8 x i16> %v.0
}

; TODO: This should just be a mov.s v0[3], wzr
define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
; CHECK-LABEL: test_insert_v4f16_f16_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov.h v0[0], wzr
; CHECK-NEXT: mov.h v0[0], v1[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
Expand All @@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
; CHECK-LABEL: test_insert_v8f16_f16_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: mov.h v0[6], wzr
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: mov.h v0[6], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
ret <8 x half> %v.0
Expand All @@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
; CHECK-LABEL: test_insert_v2f32_f32_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov.s v0[0], wzr
; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
Expand All @@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
; CHECK-LABEL: test_insert_v4f32_f32_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: mov.s v0[3], wzr
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: mov.s v0[3], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
ret <4 x float> %v.0
Expand All @@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) {
; CHECK-LABEL: test_insert_v2f64_f64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: mov.d v0[1], v1[0]
; CHECK-NEXT: ret
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
ret <2 x double> %v.0
}

; With attributes #1 (tune-cpu=cortex-a55), a half zero is inserted into lane 0
; of a <4 x half> directly from wzr, without a separate movi.
define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 {
; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov.h v0[0], wzr
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
ret <4 x half> %v.0
}

; With attributes #1 (tune-cpu=cortex-a55), a half zero is inserted into lane 6
; of an <8 x half> directly from wzr, without a separate movi.
define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 {
; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr:
; CHECK: // %bb.0:
; CHECK-NEXT: mov.h v0[6], wzr
; CHECK-NEXT: ret
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
ret <8 x half> %v.0
}

; With attributes #1 (tune-cpu=cortex-a55), a float zero is inserted into lane 0
; of a <2 x float> directly from wzr, without a separate movi.
define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 {
; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov.s v0[0], wzr
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
ret <2 x float> %v.0
}

; With attributes #1 (tune-cpu=cortex-a55), a float zero is inserted into lane 3
; of a <4 x float> directly from wzr, without a separate movi.
define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 {
; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr:
; CHECK: // %bb.0:
; CHECK-NEXT: mov.s v0[3], wzr
; CHECK-NEXT: ret
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
ret <4 x float> %v.0
}

; With attributes #1 (tune-cpu=cortex-a55), a double zero is inserted into
; lane 1 of a <2 x double> directly from xzr, without a separate movi.
define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 {
; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr:
; CHECK: // %bb.0:
; CHECK-NEXT: mov.d v0[1], xzr
; CHECK-NEXT: ret
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
ret <2 x double> %v.0
}

; Cortex-A55 tuning includes FeatureUseWzrToVecMove, so functions carrying #1
; exercise the direct wzr/xzr insert patterns.
attributes #1 = {"tune-cpu"="cortex-a55"}
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-SD-FP16-LABEL: add_v3HalfH:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000
; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr
; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0]
; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
; CHECK-SD-FP16-NEXT: ret
Expand Down
Loading