Skip to content

Commit 7be1083

Browse files
committed
fixup! [ISel] Replace expensive mov from wzr by two moves via fpr
Instead of introducing new patterns, guard existing ones with a tuning feature
1 parent 44db4f1 commit 7be1083

File tree

4 files changed

+89
-26
lines changed

4 files changed

+89
-26
lines changed

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
834834
"HasDisableFastIncVL", "true",
835835
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
836836

837+
// On most processors we want to avoid moving from WZR to vector registers
838+
// (relying on materializing 0 to a FPR and moving from there instead),
839+
// but on some (in-order) cores it's preferable to avoid the extra instruction instead.
840+
def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move",
841+
"UseWzrToVecMove", "true",
842+
"Move from WZR to insert 0 into vector registers">;
843+
837844
//===----------------------------------------------------------------------===//
838845
// Architectures.
839846
//

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
413413

414414
def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">;
415415

416+
def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">;
417+
416418

417419
//===----------------------------------------------------------------------===//
418420
// AArch64-specific DAG Nodes.
@@ -7356,10 +7358,18 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
73567358
(i64 0)),
73577359
dsub)>;
73587360

7361+
let Predicates = [UseWzrToVecMove] in {
7362+
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
7363+
(INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>;
73597364
def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
73607365
(EXTRACT_SUBREG (INSvi16gpr (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexH:$imm, WZR), dsub)>;
7366+
def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
7367+
(INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>;
73617368
def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
73627369
(EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>;
7370+
def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)),
7371+
(INSvi64gpr V128:$Rn, VectorIndexD:$imm, XZR)>;
7372+
}
73637373

73647374
def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
73657375
(f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
@@ -8029,18 +8039,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
80298039
"movi", ".2d",
80308040
[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
80318041

8032-
def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)),
8033-
(INSvi16lane V128:$Rn, VectorIndexH:$imm,
8034-
(v8f16 (MOVIv2d_ns (i32 0))), (i64 0))>;
8035-
8036-
def : Pat<(vector_insert (v4f32 V128:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)),
8037-
(INSvi32lane V128:$Rn, VectorIndexS:$imm,
8038-
(v4f32 (MOVIv2d_ns (i32 0))), (i64 0))>;
8039-
8040-
def : Pat<(vector_insert (v2f64 V128:$Rn), (f64 fpimm0), (i64 VectorIndexD:$imm)),
8041-
(INSvi64lane V128:$Rn, VectorIndexD:$imm,
8042-
(v2f64 (MOVIv2d_ns (i32 0))), (i64 0))>;
8043-
80448042
let Predicates = [HasNEON] in {
80458043
def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
80468044
def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
2121
"Cortex-A320 ARM processors", [
2222
FeatureFuseAES,
2323
FeatureFuseAdrpAdd,
24-
FeaturePostRAScheduler]>;
24+
FeaturePostRAScheduler,
25+
FeatureUseWzrToVecMove]>;
2526

2627
def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
2728
"Cortex-A53 ARM processors", [
2829
FeatureFuseAES,
2930
FeatureFuseAdrpAdd,
3031
FeatureBalanceFPOps,
31-
FeaturePostRAScheduler]>;
32+
FeaturePostRAScheduler,
33+
FeatureUseWzrToVecMove]>;
3234

3335
def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
3436
"Cortex-A55 ARM processors", [
3537
FeatureFuseAES,
3638
FeatureFuseAdrpAdd,
3739
FeaturePostRAScheduler,
38-
FeatureFuseAddress]>;
40+
FeatureFuseAddress,
41+
FeatureUseWzrToVecMove]>;
3942

4043
def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
4144
"Cortex-A510 ARM processors", [
4245
FeatureFuseAES,
4346
FeatureFuseAdrpAdd,
44-
FeaturePostRAScheduler
47+
FeaturePostRAScheduler,
48+
FeatureUseWzrToVecMove
4549
]>;
4650

4751
def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
4852
"Cortex-A520 ARM processors", [
4953
FeatureFuseAES,
5054
FeatureFuseAdrpAdd,
51-
FeaturePostRAScheduler]>;
55+
FeaturePostRAScheduler,
56+
FeatureUseWzrToVecMove]>;
5257

5358
def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
5459
"Cortex-A520AE ARM processors", [
5560
FeatureFuseAES,
5661
FeatureFuseAdrpAdd,
57-
FeaturePostRAScheduler]>;
62+
FeaturePostRAScheduler,
63+
FeatureUseWzrToVecMove]>;
5864

5965
def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
6066
"Cortex-A57 ARM processors", [

llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) {
172172
; CHECK-LABEL: test_insert_v8f16_insert_1:
173173
; CHECK: // %bb.0:
174174
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
175-
; CHECK-NEXT: movi.2d v1, #0000000000000000
175+
; CHECK-NEXT: movi d1, #0000000000000000
176176
; CHECK-NEXT: dup.8h v0, v0[0]
177177
; CHECK-NEXT: mov.h v0[7], v1[0]
178178
; CHECK-NEXT: ret
@@ -279,7 +279,7 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) {
279279
; CHECK-LABEL: test_insert_3_f32_undef_zero_vector:
280280
; CHECK: // %bb.0:
281281
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
282-
; CHECK-NEXT: movi.2d v1, #0000000000000000
282+
; CHECK-NEXT: movi d1, #0000000000000000
283283
; CHECK-NEXT: dup.4s v0, v0[0]
284284
; CHECK-NEXT: mov.s v0[3], v1[0]
285285
; CHECK-NEXT: ret
@@ -349,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) {
349349
ret <8 x i16> %v.0
350350
}
351351

352-
; TODO: This should jsut be a mov.s v0[3], wzr
353352
define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
354353
; CHECK-LABEL: test_insert_v4f16_f16_zero:
355354
; CHECK: // %bb.0:
355+
; CHECK-NEXT: movi d1, #0000000000000000
356356
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
357-
; CHECK-NEXT: mov.h v0[0], wzr
357+
; CHECK-NEXT: mov.h v0[0], v1[0]
358358
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
359359
; CHECK-NEXT: ret
360360
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
@@ -364,7 +364,7 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) {
364364
define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
365365
; CHECK-LABEL: test_insert_v8f16_f16_zero:
366366
; CHECK: // %bb.0:
367-
; CHECK-NEXT: movi.2d v1, #0000000000000000
367+
; CHECK-NEXT: movi d1, #0000000000000000
368368
; CHECK-NEXT: mov.h v0[6], v1[0]
369369
; CHECK-NEXT: ret
370370
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
@@ -374,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) {
374374
define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
375375
; CHECK-LABEL: test_insert_v2f32_f32_zero:
376376
; CHECK: // %bb.0:
377+
; CHECK-NEXT: movi d1, #0000000000000000
377378
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
378-
; CHECK-NEXT: mov.s v0[0], wzr
379+
; CHECK-NEXT: mov.s v0[0], v1[0]
379380
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
380381
; CHECK-NEXT: ret
381382
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
@@ -385,7 +386,7 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) {
385386
define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
386387
; CHECK-LABEL: test_insert_v4f32_f32_zero:
387388
; CHECK: // %bb.0:
388-
; CHECK-NEXT: movi.2d v1, #0000000000000000
389+
; CHECK-NEXT: movi d1, #0000000000000000
389390
; CHECK-NEXT: mov.s v0[3], v1[0]
390391
; CHECK-NEXT: ret
391392
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
@@ -395,9 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) {
395396
define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) {
396397
; CHECK-LABEL: test_insert_v2f64_f64_zero:
397398
; CHECK: // %bb.0:
398-
; CHECK-NEXT: movi.2d v1, #0000000000000000
399+
; CHECK-NEXT: movi d1, #0000000000000000
399400
; CHECK-NEXT: mov.d v0[1], v1[0]
400401
; CHECK-NEXT: ret
401402
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
402403
ret <2 x double> %v.0
403404
}
405+
406+
define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 {
407+
; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr:
408+
; CHECK: // %bb.0:
409+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
410+
; CHECK-NEXT: mov.h v0[0], wzr
411+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
412+
; CHECK-NEXT: ret
413+
%v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0
414+
ret <4 x half> %v.0
415+
}
416+
417+
define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 {
418+
; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr:
419+
; CHECK: // %bb.0:
420+
; CHECK-NEXT: mov.h v0[6], wzr
421+
; CHECK-NEXT: ret
422+
%v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6
423+
ret <8 x half> %v.0
424+
}
425+
426+
define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 {
427+
; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr:
428+
; CHECK: // %bb.0:
429+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
430+
; CHECK-NEXT: mov.s v0[0], wzr
431+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
432+
; CHECK-NEXT: ret
433+
%v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0
434+
ret <2 x float> %v.0
435+
}
436+
437+
define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 {
438+
; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr:
439+
; CHECK: // %bb.0:
440+
; CHECK-NEXT: mov.s v0[3], wzr
441+
; CHECK-NEXT: ret
442+
%v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3
443+
ret <4 x float> %v.0
444+
}
445+
446+
define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 {
447+
; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr:
448+
; CHECK: // %bb.0:
449+
; CHECK-NEXT: mov.d v0[1], xzr
450+
; CHECK-NEXT: ret
451+
%v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1
452+
ret <2 x double> %v.0
453+
}
454+
455+
attributes #1 = {"tune-cpu"="cortex-a55"}

0 commit comments

Comments
 (0)