Skip to content

Commit c3c24be

Browse files
authored
[AArch64] Split zero cycle zeroing per register class (#154561)
This change improves LLVM's model accuracy by splitting the AArch64 subtarget features for zero-cycle zeroing per register class. This aligns with how the microarchitecture is designed (each register bank has unique capabilities), similarly to how we improved ZCM modeling. It splits `HasZeroCycleZeroingGP` into `HasZeroCycleZeroingGPR32` and `HasZeroCycleZeroingGPR64`, removes the opaque `FeatureZCZeroing`, and infers `FeatureNoZCZeroingFP` to be `FeatureNoZCZeroingFPR64` based on its single usage in `AArch64AsmPrinter.cpp`. It also splits `arm64-zero-cycle-zeroing.ll` into two tests, one `-gpr` and one `-fpr`, similarly to ZCM, to make the tests more focused and manageable in correspondence with the new modeling. The test cases are updated as well, exploiting the fact that this is a refactor patch: - remove redundant functions that just mix isolated ones (t1-4) - specialize check prefixes - replace `apple-a10` with `apple-m1` - add a `-mtriple=arm64-apple-macosx -mcpu=generic` test case for GPR - isolate the `mtriple=arm64-apple-ios -mcpu=cyclone` FP workaround test case and move `-fullfp16` to another non-workaround test case
1 parent c4b7715 commit c3c24be

9 files changed

+241
-266
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1829,8 +1829,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18291829

18301830
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18311831
Register DestReg = MI.getOperand(0).getReg();
1832-
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
1833-
STI->isNeonAvailable()) {
1832+
if (STI->hasZeroCycleZeroingFPR64() &&
1833+
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
18341834
// Convert H/S register to corresponding D register
18351835
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
18361836
DestReg = AArch64::D0 + (DestReg - AArch64::H0);

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -630,19 +630,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
630630
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
631631
"Has zero-cycle register moves for FPR32 registers">;
632632

633-
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
634-
"Has zero-cycle zeroing instructions for generic registers">;
633+
def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
634+
"Has zero-cycle zeroing instructions for GPR64 registers">;
635+
636+
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
637+
"Has zero-cycle zeroing instructions for GPR32 registers">;
635638

636639
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
637640
// as movi is more efficient across all cores. Newer cores can eliminate
638641
// fmovs early and there is no difference with movi, but this is not true for
639642
// all implementations.
640-
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
641-
"Has no zero-cycle zeroing instructions for FP registers">;
642-
643-
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
644-
"Has zero-cycle zeroing instructions",
645-
[FeatureZCZeroingGP]>;
643+
def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
644+
"Has no zero-cycle zeroing instructions for FPR64 registers">;
646645

647646
/// ... but the floating-point version doesn't quite work in rare cases on older
648647
/// CPUs.

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
50755075
.addImm(0)
50765076
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
50775077
}
5078-
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
5078+
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
50795079
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
50805080
.addImm(0)
50815081
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
52025202
.addReg(SrcReg, getKillRegState(KillSrc))
52035203
.addImm(0)
52045204
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5205-
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5205+
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
52065206
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
52075207
.addImm(0)
52085208
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
322322
FeatureStorePairSuppress,
323323
FeatureZCRegMoveGPR64,
324324
FeatureZCRegMoveFPR128,
325-
FeatureZCZeroing,
325+
FeatureZCZeroingGPR32,
326+
FeatureZCZeroingGPR64,
326327
FeatureZCZeroingFPWorkaround]>;
327328

328329
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -336,7 +337,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
336337
FeatureStorePairSuppress,
337338
FeatureZCRegMoveGPR64,
338339
FeatureZCRegMoveFPR128,
339-
FeatureZCZeroing]>;
340+
FeatureZCZeroingGPR32,
341+
FeatureZCZeroingGPR64]>;
340342

341343
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
342344
"Apple A11", [
@@ -349,7 +351,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
349351
FeatureStorePairSuppress,
350352
FeatureZCRegMoveGPR64,
351353
FeatureZCRegMoveFPR128,
352-
FeatureZCZeroing]>;
354+
FeatureZCZeroingGPR32,
355+
FeatureZCZeroingGPR64]>;
353356

354357
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
355358
"Apple A12", [
@@ -362,7 +365,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
362365
FeatureStorePairSuppress,
363366
FeatureZCRegMoveGPR64,
364367
FeatureZCRegMoveFPR128,
365-
FeatureZCZeroing]>;
368+
FeatureZCZeroingGPR32,
369+
FeatureZCZeroingGPR64]>;
366370

367371
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
368372
"Apple A13", [
@@ -375,7 +379,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
375379
FeatureStorePairSuppress,
376380
FeatureZCRegMoveGPR64,
377381
FeatureZCRegMoveFPR128,
378-
FeatureZCZeroing]>;
382+
FeatureZCZeroingGPR32,
383+
FeatureZCZeroingGPR64]>;
379384

380385
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
381386
"Apple A14", [
@@ -393,7 +398,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
393398
FeatureStorePairSuppress,
394399
FeatureZCRegMoveGPR64,
395400
FeatureZCRegMoveFPR128,
396-
FeatureZCZeroing]>;
401+
FeatureZCZeroingGPR32,
402+
FeatureZCZeroingGPR64]>;
397403

398404
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
399405
"Apple A15", [
@@ -411,7 +417,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
411417
FeatureStorePairSuppress,
412418
FeatureZCRegMoveGPR64,
413419
FeatureZCRegMoveFPR128,
414-
FeatureZCZeroing]>;
420+
FeatureZCZeroingGPR32,
421+
FeatureZCZeroingGPR64]>;
415422

416423
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
417424
"Apple A16", [
@@ -429,7 +436,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
429436
FeatureStorePairSuppress,
430437
FeatureZCRegMoveGPR64,
431438
FeatureZCRegMoveFPR128,
432-
FeatureZCZeroing]>;
439+
FeatureZCZeroingGPR32,
440+
FeatureZCZeroingGPR64]>;
433441

434442
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
435443
"Apple A17", [
@@ -447,7 +455,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
447455
FeatureStorePairSuppress,
448456
FeatureZCRegMoveGPR64,
449457
FeatureZCRegMoveFPR128,
450-
FeatureZCZeroing]>;
458+
FeatureZCZeroingGPR32,
459+
FeatureZCZeroingGPR64]>;
451460

452461
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
453462
"Apple M4", [
@@ -464,8 +473,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
464473
FeatureFuseLiterals,
465474
FeatureZCRegMoveGPR64,
466475
FeatureZCRegMoveFPR128,
467-
FeatureZCZeroing
468-
]>;
476+
FeatureZCZeroingGPR32,
477+
FeatureZCZeroingGPR64]>;
469478

470479
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
471480
"Samsung Exynos-M3 processors",
@@ -497,21 +506,24 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
497506
FeatureStorePairSuppress,
498507
FeatureALULSLFast,
499508
FeaturePostRAScheduler,
500-
FeatureZCZeroing]>;
509+
FeatureZCZeroingGPR32,
510+
FeatureZCZeroingGPR64]>;
501511

502512
def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
503513
"Qualcomm Kryo processors", [
504514
FeaturePostRAScheduler,
505515
FeaturePredictableSelectIsExpensive,
506-
FeatureZCZeroing,
516+
FeatureZCZeroingGPR32,
517+
FeatureZCZeroingGPR64,
507518
FeatureALULSLFast,
508519
FeatureStorePairSuppress]>;
509520

510521
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
511522
"Qualcomm Falkor processors", [
512523
FeaturePostRAScheduler,
513524
FeaturePredictableSelectIsExpensive,
514-
FeatureZCZeroing,
525+
FeatureZCZeroingGPR32,
526+
FeatureZCZeroingGPR64,
515527
FeatureStorePairSuppress,
516528
FeatureALULSLFast,
517529
FeatureSlowSTRQro]>;
@@ -607,7 +619,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
607619
"Qualcomm Saphira processors", [
608620
FeaturePostRAScheduler,
609621
FeaturePredictableSelectIsExpensive,
610-
FeatureZCZeroing,
622+
FeatureZCZeroingGPR32,
623+
FeatureZCZeroingGPR64,
611624
FeatureStorePairSuppress,
612625
FeatureALULSLFast]>;
613626

llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \
2+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
33
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
4-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \
4+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
55
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
6-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \
6+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
77
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
8-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \
8+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
99
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
10-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \
10+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1111
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
12-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \
12+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1313
# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
1414

1515
--- |
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
2+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
3+
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
4+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
5+
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
6+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
7+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
8+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
9+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
10+
11+
define half @tf16() {
12+
entry:
13+
; ALL-LABEL: tf16:
14+
; FP-WORKAROUND: mov s0, wzr
15+
; NOZCZ-FPR64: mov s0, wzr
16+
; NOZCZ-FPR64-FULLFP16: mov h0, wzr
17+
; ZCZ-FPR64: movi d0, #0
18+
ret half 0.0
19+
}
20+
21+
define float @tf32() {
22+
entry:
23+
; ALL-LABEL: tf32:
24+
; FP-WORKAROUND: mov s0, wzr
25+
; NOZCZ-FPR64: mov s0, wzr
26+
; ZCZ-FPR64: movi d0, #0
27+
ret float 0.0
28+
}
29+
30+
define double @td64() {
31+
entry:
32+
; ALL-LABEL: td64:
33+
; FP-WORKAROUND: mov d0, xzr
34+
; NOZCZ-FPR64: mov d0, xzr
35+
; ZCZ-FPR64: movi d0, #0
36+
ret double 0.0
37+
}
38+
39+
define <8 x i8> @tv8i8() {
40+
entry:
41+
; ALL-LABEL: tv8i8:
42+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
43+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
44+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
45+
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
46+
}
47+
48+
define <4 x i16> @tv4i16() {
49+
entry:
50+
; ALL-LABEL: tv4i16:
51+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
52+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
53+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
54+
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
55+
}
56+
57+
define <2 x i32> @tv2i32() {
58+
entry:
59+
; ALL-LABEL: tv2i32:
60+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
61+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
62+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
63+
ret <2 x i32> <i32 0, i32 0>
64+
}
65+
66+
define <2 x float> @tv2f32() {
67+
entry:
68+
; ALL-LABEL: tv2f32:
69+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
70+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
71+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
72+
ret <2 x float> <float 0.0, float 0.0>
73+
}
74+
75+
define <16 x i8> @tv16i8() {
76+
entry:
77+
; ALL-LABEL: tv16i8:
78+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
79+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
80+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
81+
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
82+
}
83+
84+
define <8 x i16> @tv8i16() {
85+
entry:
86+
; ALL-LABEL: tv8i16:
87+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
88+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
89+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
90+
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
91+
}
92+
93+
define <4 x i32> @tv4i32() {
94+
entry:
95+
; ALL-LABEL: tv4i32:
96+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
97+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
98+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
99+
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
100+
}
101+
102+
define <2 x i64> @tv2i64() {
103+
entry:
104+
; ALL-LABEL: tv2i64:
105+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
106+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
107+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
108+
ret <2 x i64> <i64 0, i64 0>
109+
}
110+
111+
define <4 x float> @tv4f32() {
112+
entry:
113+
; ALL-LABEL: tv4f32:
114+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
115+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
116+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
117+
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
118+
}
119+
120+
define <2 x double> @tv2d64() {
121+
entry:
122+
; ALL-LABEL: tv2d64:
123+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
124+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
125+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
126+
ret <2 x double> <double 0.0, double 0.0>
127+
}
128+
129+
; We used to produce spills+reloads for a Q register with zero cycle zeroing
130+
; enabled.
131+
; ALL-LABEL: foo:
132+
; ALL-NOT: str q{{[0-9]+}}
133+
; ALL-NOT: ldr q{{[0-9]+}}
134+
define double @foo(i32 %n) {
135+
entry:
136+
br label %for.body
137+
138+
for.body:
139+
%phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
140+
%i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
141+
%conv21 = sitofp i32 %i.076 to double
142+
%call = tail call fast double @sin(double %conv21)
143+
%cmp.i = fcmp fast olt double %phi0, %call
144+
%v0 = select i1 %cmp.i, double %call, double %phi0
145+
%inc = add nuw nsw i32 %i.076, 1
146+
%cmp = icmp slt i32 %inc, %n
147+
br i1 %cmp, label %for.body, label %for.end
148+
149+
for.end:
150+
ret double %v0
151+
}
152+
153+
declare double @sin(double)

0 commit comments

Comments
 (0)