Skip to content

Commit 78e6120

Browse files
committed
[AArch64] Split zero cycle zeroing per register class (llvm#154561)
This change improves LLVM's model accuracy by splitting AArch64 subtarget features of zero cycle zeroing per register class. This aligns with how uarch is designed (each register bank has unique capabilities), similarly to how we improved ZCM modeling. It splits `HasZeroCycleZeroingGP` to `HasZeroCycleZeroingGPR32` and `HasZeroCycleZeroingGPR64`, removes opaque `FeatureZCZeroing`, and infers `FeatureNoZCZeroingFP` to be `FeatureNoZCZeroingFPR64` based on the single usage in `AArch64AsmPrinter.cpp`. It also splits `arm64-zero-cycle-zeroing.ll` into 2 tests, one `-gpr` and one `-fpr`, similarly to ZCM, to make the tests more focused and manageable in correspondence with the new modeling. The test cases are updated as well, exploiting the fact that this is a refactor patch: - remove redundant functions that just mix isolated ones (t1-4) - specialize check prefixes - replace `apple-a10` with `apple-m1` - add a `-mtriple=arm64-apple-macosx -mcpu=generic` test case for GPR - isolate `mtriple=arm64-apple-ios -mcpu=cyclone` FP workaround test case and move `-fullfp16` to another non-workaround test case (cherry-pick c3c24be)
1 parent b2709ef commit 78e6120

File tree

9 files changed

+241
-266
lines changed

9 files changed

+241
-266
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1893,8 +1893,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18931893

18941894
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18951895
Register DestReg = MI.getOperand(0).getReg();
1896-
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
1897-
STI->isNeonAvailable()) {
1896+
if (STI->hasZeroCycleZeroingFPR64() &&
1897+
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
18981898
// Convert H/S register to corresponding D register
18991899
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
19001900
DestReg = AArch64::D0 + (DestReg - AArch64::H0);

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -630,19 +630,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
630630
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
631631
"Has zero-cycle register moves for FPR32 registers">;
632632

633-
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
634-
"Has zero-cycle zeroing instructions for generic registers">;
633+
def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
634+
"Has zero-cycle zeroing instructions for GPR64 registers">;
635+
636+
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
637+
"Has zero-cycle zeroing instructions for GPR32 registers">;
635638

636639
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
637640
// as movi is more efficient across all cores. Newer cores can eliminate
638641
// fmovs early and there is no difference with movi, but this not true for
639642
// all implementations.
640-
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
641-
"Has no zero-cycle zeroing instructions for FP registers">;
642-
643-
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
644-
"Has zero-cycle zeroing instructions",
645-
[FeatureZCZeroingGP]>;
643+
def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
644+
"Has no zero-cycle zeroing instructions for FPR64 registers">;
646645

647646
/// ... but the floating-point version doesn't quite work in rare cases on older
648647
/// CPUs.

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5069,7 +5069,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
50695069
.addImm(0)
50705070
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
50715071
}
5072-
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
5072+
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
50735073
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
50745074
.addImm(0)
50755075
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5196,7 +5196,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
51965196
.addReg(SrcReg, getKillRegState(KillSrc))
51975197
.addImm(0)
51985198
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5199-
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5199+
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
52005200
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
52015201
.addImm(0)
52025202
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
313313
FeatureStorePairSuppress,
314314
FeatureZCRegMoveGPR64,
315315
FeatureZCRegMoveFPR128,
316-
FeatureZCZeroing,
316+
FeatureZCZeroingGPR32,
317+
FeatureZCZeroingGPR64,
317318
FeatureZCZeroingFPWorkaround]>;
318319

319320
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -327,7 +328,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
327328
FeatureStorePairSuppress,
328329
FeatureZCRegMoveGPR64,
329330
FeatureZCRegMoveFPR128,
330-
FeatureZCZeroing]>;
331+
FeatureZCZeroingGPR32,
332+
FeatureZCZeroingGPR64]>;
331333

332334
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
333335
"Apple A11", [
@@ -340,7 +342,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
340342
FeatureStorePairSuppress,
341343
FeatureZCRegMoveGPR64,
342344
FeatureZCRegMoveFPR128,
343-
FeatureZCZeroing]>;
345+
FeatureZCZeroingGPR32,
346+
FeatureZCZeroingGPR64]>;
344347

345348
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
346349
"Apple A12", [
@@ -353,7 +356,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
353356
FeatureStorePairSuppress,
354357
FeatureZCRegMoveGPR64,
355358
FeatureZCRegMoveFPR128,
356-
FeatureZCZeroing]>;
359+
FeatureZCZeroingGPR32,
360+
FeatureZCZeroingGPR64]>;
357361

358362
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
359363
"Apple A13", [
@@ -366,7 +370,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
366370
FeatureStorePairSuppress,
367371
FeatureZCRegMoveGPR64,
368372
FeatureZCRegMoveFPR128,
369-
FeatureZCZeroing]>;
373+
FeatureZCZeroingGPR32,
374+
FeatureZCZeroingGPR64]>;
370375

371376
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
372377
"Apple A14", [
@@ -384,7 +389,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
384389
FeatureStorePairSuppress,
385390
FeatureZCRegMoveGPR64,
386391
FeatureZCRegMoveFPR128,
387-
FeatureZCZeroing]>;
392+
FeatureZCZeroingGPR32,
393+
FeatureZCZeroingGPR64]>;
388394

389395
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
390396
"Apple A15", [
@@ -402,7 +408,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
402408
FeatureStorePairSuppress,
403409
FeatureZCRegMoveGPR64,
404410
FeatureZCRegMoveFPR128,
405-
FeatureZCZeroing]>;
411+
FeatureZCZeroingGPR32,
412+
FeatureZCZeroingGPR64]>;
406413

407414
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
408415
"Apple A16", [
@@ -420,7 +427,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
420427
FeatureStorePairSuppress,
421428
FeatureZCRegMoveGPR64,
422429
FeatureZCRegMoveFPR128,
423-
FeatureZCZeroing]>;
430+
FeatureZCZeroingGPR32,
431+
FeatureZCZeroingGPR64]>;
424432

425433
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
426434
"Apple A17", [
@@ -438,7 +446,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
438446
FeatureStorePairSuppress,
439447
FeatureZCRegMoveGPR64,
440448
FeatureZCRegMoveFPR128,
441-
FeatureZCZeroing]>;
449+
FeatureZCZeroingGPR32,
450+
FeatureZCZeroingGPR64]>;
442451

443452
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
444453
"Apple M4", [
@@ -455,8 +464,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
455464
FeatureFuseLiterals,
456465
FeatureZCRegMoveGPR64,
457466
FeatureZCRegMoveFPR128,
458-
FeatureZCZeroing
459-
]>;
467+
FeatureZCZeroingGPR32,
468+
FeatureZCZeroingGPR64]>;
460469

461470
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
462471
"Samsung Exynos-M3 processors",
@@ -488,21 +497,24 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
488497
FeatureStorePairSuppress,
489498
FeatureALULSLFast,
490499
FeaturePostRAScheduler,
491-
FeatureZCZeroing]>;
500+
FeatureZCZeroingGPR32,
501+
FeatureZCZeroingGPR64]>;
492502

493503
def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
494504
"Qualcomm Kryo processors", [
495505
FeaturePostRAScheduler,
496506
FeaturePredictableSelectIsExpensive,
497-
FeatureZCZeroing,
507+
FeatureZCZeroingGPR32,
508+
FeatureZCZeroingGPR64,
498509
FeatureALULSLFast,
499510
FeatureStorePairSuppress]>;
500511

501512
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
502513
"Qualcomm Falkor processors", [
503514
FeaturePostRAScheduler,
504515
FeaturePredictableSelectIsExpensive,
505-
FeatureZCZeroing,
516+
FeatureZCZeroingGPR32,
517+
FeatureZCZeroingGPR64,
506518
FeatureStorePairSuppress,
507519
FeatureALULSLFast,
508520
FeatureSlowSTRQro]>;
@@ -598,7 +610,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
598610
"Qualcomm Saphira processors", [
599611
FeaturePostRAScheduler,
600612
FeaturePredictableSelectIsExpensive,
601-
FeatureZCZeroing,
613+
FeatureZCZeroingGPR32,
614+
FeatureZCZeroingGPR64,
602615
FeatureStorePairSuppress,
603616
FeatureALULSLFast]>;
604617

llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \
2+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
33
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
4-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \
4+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
55
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
6-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \
6+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
77
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
8-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \
8+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
99
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
10-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \
10+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1111
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
12-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \
12+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1313
# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
1414

1515
--- |
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
2+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
3+
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
4+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
5+
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
6+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
7+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
8+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
9+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
10+
11+
define half @tf16() {
12+
entry:
13+
; ALL-LABEL: tf16:
14+
; FP-WORKAROUND: mov s0, wzr
15+
; NOZCZ-FPR64: mov s0, wzr
16+
; NOZCZ-FPR64-FULLFP16: mov h0, wzr
17+
; ZCZ-FPR64: movi d0, #0
18+
ret half 0.0
19+
}
20+
21+
define float @tf32() {
22+
entry:
23+
; ALL-LABEL: tf32:
24+
; FP-WORKAROUND: mov s0, wzr
25+
; NOZCZ-FPR64: mov s0, wzr
26+
; ZCZ-FPR64: movi d0, #0
27+
ret float 0.0
28+
}
29+
30+
define double @td64() {
31+
entry:
32+
; ALL-LABEL: td64:
33+
; FP-WORKAROUND: mov d0, xzr
34+
; NOZCZ-FPR64: mov d0, xzr
35+
; ZCZ-FPR64: movi d0, #0
36+
ret double 0.0
37+
}
38+
39+
define <8 x i8> @tv8i8() {
40+
entry:
41+
; ALL-LABEL: tv8i8:
42+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
43+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
44+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
45+
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
46+
}
47+
48+
define <4 x i16> @tv4i16() {
49+
entry:
50+
; ALL-LABEL: tv4i16:
51+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
52+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
53+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
54+
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
55+
}
56+
57+
define <2 x i32> @tv2i32() {
58+
entry:
59+
; ALL-LABEL: tv2i32:
60+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
61+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
62+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
63+
ret <2 x i32> <i32 0, i32 0>
64+
}
65+
66+
define <2 x float> @tv2f32() {
67+
entry:
68+
; ALL-LABEL: tv2f32:
69+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
70+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
71+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
72+
ret <2 x float> <float 0.0, float 0.0>
73+
}
74+
75+
define <16 x i8> @tv16i8() {
76+
entry:
77+
; ALL-LABEL: tv16i8:
78+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
79+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
80+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
81+
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
82+
}
83+
84+
define <8 x i16> @tv8i16() {
85+
entry:
86+
; ALL-LABEL: tv8i16:
87+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
88+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
89+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
90+
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
91+
}
92+
93+
define <4 x i32> @tv4i32() {
94+
entry:
95+
; ALL-LABEL: tv4i32:
96+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
97+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
98+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
99+
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
100+
}
101+
102+
define <2 x i64> @tv2i64() {
103+
entry:
104+
; ALL-LABEL: tv2i64:
105+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
106+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
107+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
108+
ret <2 x i64> <i64 0, i64 0>
109+
}
110+
111+
define <4 x float> @tv4f32() {
112+
entry:
113+
; ALL-LABEL: tv4f32:
114+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
115+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
116+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
117+
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
118+
}
119+
120+
define <2 x double> @tv2d64() {
121+
entry:
122+
; ALL-LABEL: tv2d64:
123+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
124+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
125+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
126+
ret <2 x double> <double 0.0, double 0.0>
127+
}
128+
129+
; We used to produce spills+reloads for a Q register with zero cycle zeroing
130+
; enabled.
131+
; ALL-LABEL: foo:
132+
; ALL-NOT: str q{{[0-9]+}}
133+
; ALL-NOT: ldr q{{[0-9]+}}
134+
define double @foo(i32 %n) {
135+
entry:
136+
br label %for.body
137+
138+
for.body:
139+
%phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
140+
%i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
141+
%conv21 = sitofp i32 %i.076 to double
142+
%call = tail call fast double @sin(double %conv21)
143+
%cmp.i = fcmp fast olt double %phi0, %call
144+
%v0 = select i1 %cmp.i, double %call, double %phi0
145+
%inc = add nuw nsw i32 %i.076, 1
146+
%cmp = icmp slt i32 %inc, %n
147+
br i1 %cmp, label %for.body, label %for.end
148+
149+
for.end:
150+
ret double %v0
151+
}
152+
153+
declare double @sin(double)

0 commit comments

Comments
 (0)