Skip to content

Commit f059d2b

Browse files
authored
[AArch64] Lower zero cycle FPR zeroing (#156261)
Lower FPR64, FPR32, FPR16 from `fmov` zeroing into NEON zeroing if the target supports zero cycle zeroing of NEON registers but not for the narrower classes. It handles 2 cases: one in `AsmPrinter`, where an FP zeroing from an immediate has been captured by pattern matching during instruction selection, and a second, post-RA, in `AArch64InstrInfo::copyPhysReg` for uncaptured/later-generated WZR/XZR fmovs. Adds a subtarget feature called FeatureZCZeroingFPR128 that allows querying whether the target supports zero cycle zeroing for FPR128 NEON registers, and updates the appropriate processors.
1 parent 127d77d commit f059d2b

15 files changed

+249
-171
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 69 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {
307307

308308
/// Emit instruction to set float register to zero.
309309
void emitFMov0(const MachineInstr &MI);
310+
void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
310311

311312
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
312313

@@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18291830

18301831
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18311832
Register DestReg = MI.getOperand(0).getReg();
1832-
if (STI->hasZeroCycleZeroingFPR64() &&
1833-
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
1834-
// Convert H/S register to corresponding D register
1835-
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
1836-
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
1837-
else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
1838-
DestReg = AArch64::D0 + (DestReg - AArch64::S0);
1839-
else
1840-
assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
1833+
if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
1834+
if (STI->hasZeroCycleZeroingFPR64()) {
1835+
// Convert H/S register to corresponding D register
1836+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1837+
if (AArch64::FPR16RegClass.contains(DestReg))
1838+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1839+
&AArch64::FPR64RegClass);
1840+
else if (AArch64::FPR32RegClass.contains(DestReg))
1841+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1842+
&AArch64::FPR64RegClass);
1843+
else
1844+
assert(AArch64::FPR64RegClass.contains(DestReg));
1845+
1846+
MCInst MOVI;
1847+
MOVI.setOpcode(AArch64::MOVID);
1848+
MOVI.addOperand(MCOperand::createReg(DestReg));
1849+
MOVI.addOperand(MCOperand::createImm(0));
1850+
EmitToStreamer(*OutStreamer, MOVI);
1851+
} else if (STI->hasZeroCycleZeroingFPR128()) {
1852+
// Convert H/S/D register to corresponding Q register
1853+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1854+
if (AArch64::FPR16RegClass.contains(DestReg)) {
1855+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1856+
&AArch64::FPR128RegClass);
1857+
} else if (AArch64::FPR32RegClass.contains(DestReg)) {
1858+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1859+
&AArch64::FPR128RegClass);
1860+
} else {
1861+
assert(AArch64::FPR64RegClass.contains(DestReg));
1862+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
1863+
&AArch64::FPR128RegClass);
1864+
}
18411865

1842-
MCInst MOVI;
1843-
MOVI.setOpcode(AArch64::MOVID);
1844-
MOVI.addOperand(MCOperand::createReg(DestReg));
1845-
MOVI.addOperand(MCOperand::createImm(0));
1846-
EmitToStreamer(*OutStreamer, MOVI);
1847-
} else {
1848-
MCInst FMov;
1849-
switch (MI.getOpcode()) {
1850-
default: llvm_unreachable("Unexpected opcode");
1851-
case AArch64::FMOVH0:
1852-
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1853-
if (!STI->hasFullFP16())
1854-
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1855-
FMov.addOperand(MCOperand::createReg(DestReg));
1856-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1857-
break;
1858-
case AArch64::FMOVS0:
1859-
FMov.setOpcode(AArch64::FMOVWSr);
1860-
FMov.addOperand(MCOperand::createReg(DestReg));
1861-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1862-
break;
1863-
case AArch64::FMOVD0:
1864-
FMov.setOpcode(AArch64::FMOVXDr);
1865-
FMov.addOperand(MCOperand::createReg(DestReg));
1866-
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1867-
break;
1866+
MCInst MOVI;
1867+
MOVI.setOpcode(AArch64::MOVIv2d_ns);
1868+
MOVI.addOperand(MCOperand::createReg(DestReg));
1869+
MOVI.addOperand(MCOperand::createImm(0));
1870+
EmitToStreamer(*OutStreamer, MOVI);
1871+
} else {
1872+
emitFMov0AsFMov(MI, DestReg);
18681873
}
1869-
EmitToStreamer(*OutStreamer, FMov);
1874+
} else {
1875+
emitFMov0AsFMov(MI, DestReg);
1876+
}
1877+
}
1878+
1879+
void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
1880+
Register DestReg) {
1881+
MCInst FMov;
1882+
switch (MI.getOpcode()) {
1883+
default:
1884+
llvm_unreachable("Unexpected opcode");
1885+
case AArch64::FMOVH0:
1886+
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1887+
if (!STI->hasFullFP16())
1888+
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1889+
FMov.addOperand(MCOperand::createReg(DestReg));
1890+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1891+
break;
1892+
case AArch64::FMOVS0:
1893+
FMov.setOpcode(AArch64::FMOVWSr);
1894+
FMov.addOperand(MCOperand::createReg(DestReg));
1895+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1896+
break;
1897+
case AArch64::FMOVD0:
1898+
FMov.setOpcode(AArch64::FMOVXDr);
1899+
FMov.addOperand(MCOperand::createReg(DestReg));
1900+
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1901+
break;
18701902
}
1903+
EmitToStreamer(*OutStreamer, FMov);
18711904
}
18721905

18731906
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
636636
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
637637
"Has zero-cycle zeroing instructions for GPR32 registers">;
638638

639+
def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
640+
"Has zero-cycle zeroing instructions for FPR128 registers">;
641+
639642
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
640643
// as movi is more efficient across all cores. Newer cores can eliminate
641644
// fmovs early and there is no difference with movi, but this is not true for

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5471,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
54715471
// Copies between GPR64 and FPR64.
54725472
if (AArch64::FPR64RegClass.contains(DestReg) &&
54735473
AArch64::GPR64RegClass.contains(SrcReg)) {
5474-
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5475-
.addReg(SrcReg, getKillRegState(KillSrc));
5474+
if (AArch64::XZR == SrcReg) {
5475+
BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
5476+
} else {
5477+
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5478+
.addReg(SrcReg, getKillRegState(KillSrc));
5479+
}
54765480
return;
54775481
}
54785482
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5484,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
54845488
// Copies between GPR32 and FPR32.
54855489
if (AArch64::FPR32RegClass.contains(DestReg) &&
54865490
AArch64::GPR32RegClass.contains(SrcReg)) {
5487-
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5488-
.addReg(SrcReg, getKillRegState(KillSrc));
5491+
if (AArch64::WZR == SrcReg) {
5492+
BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
5493+
} else {
5494+
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5495+
.addReg(SrcReg, getKillRegState(KillSrc));
5496+
}
54895497
return;
54905498
}
54915499
if (AArch64::GPR32RegClass.contains(DestReg) &&

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
344344
FeatureZCRegMoveFPR128,
345345
FeatureZCZeroingGPR32,
346346
FeatureZCZeroingGPR64,
347+
FeatureNoZCZeroingFPR64,
348+
FeatureZCZeroingFPR128,
347349
FeatureZCZeroingFPWorkaround]>;
348350

349351
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
358360
FeatureZCRegMoveGPR64,
359361
FeatureZCRegMoveFPR128,
360362
FeatureZCZeroingGPR32,
361-
FeatureZCZeroingGPR64]>;
363+
FeatureZCZeroingGPR64,
364+
FeatureNoZCZeroingFPR64,
365+
FeatureZCZeroingFPR128]>;
362366

363367
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
364368
"Apple A11", [
@@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
372376
FeatureZCRegMoveGPR64,
373377
FeatureZCRegMoveFPR128,
374378
FeatureZCZeroingGPR32,
375-
FeatureZCZeroingGPR64]>;
379+
FeatureZCZeroingGPR64,
380+
FeatureNoZCZeroingFPR64,
381+
FeatureZCZeroingFPR128]>;
376382

377383
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
378384
"Apple A12", [
@@ -386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
386392
FeatureZCRegMoveGPR64,
387393
FeatureZCRegMoveFPR128,
388394
FeatureZCZeroingGPR32,
389-
FeatureZCZeroingGPR64]>;
395+
FeatureZCZeroingGPR64,
396+
FeatureNoZCZeroingFPR64,
397+
FeatureZCZeroingFPR128]>;
390398

391399
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
392400
"Apple A13", [
@@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
400408
FeatureZCRegMoveGPR64,
401409
FeatureZCRegMoveFPR128,
402410
FeatureZCZeroingGPR32,
403-
FeatureZCZeroingGPR64]>;
411+
FeatureZCZeroingGPR64,
412+
FeatureNoZCZeroingFPR64,
413+
FeatureZCZeroingFPR128]>;
404414

405415
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
406416
"Apple A14", [
@@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
419429
FeatureZCRegMoveGPR64,
420430
FeatureZCRegMoveFPR128,
421431
FeatureZCZeroingGPR32,
422-
FeatureZCZeroingGPR64]>;
432+
FeatureZCZeroingGPR64,
433+
FeatureNoZCZeroingFPR64,
434+
FeatureZCZeroingFPR128]>;
423435

424436
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
425437
"Apple A15", [
@@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
438450
FeatureZCRegMoveGPR64,
439451
FeatureZCRegMoveFPR128,
440452
FeatureZCZeroingGPR32,
441-
FeatureZCZeroingGPR64]>;
453+
FeatureZCZeroingGPR64,
454+
FeatureNoZCZeroingFPR64,
455+
FeatureZCZeroingFPR128]>;
442456

443457
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
444458
"Apple A16", [
@@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
457471
FeatureZCRegMoveGPR64,
458472
FeatureZCRegMoveFPR128,
459473
FeatureZCZeroingGPR32,
460-
FeatureZCZeroingGPR64]>;
474+
FeatureZCZeroingGPR64,
475+
FeatureNoZCZeroingFPR64,
476+
FeatureZCZeroingFPR128]>;
461477

462478
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
463479
"Apple A17", [
@@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
476492
FeatureZCRegMoveGPR64,
477493
FeatureZCRegMoveFPR128,
478494
FeatureZCZeroingGPR32,
479-
FeatureZCZeroingGPR64]>;
495+
FeatureZCZeroingGPR64,
496+
FeatureNoZCZeroingFPR64,
497+
FeatureZCZeroingFPR128]>;
480498

481499
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
482500
"Apple M4", [
@@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
494512
FeatureZCRegMoveGPR64,
495513
FeatureZCRegMoveFPR128,
496514
FeatureZCZeroingGPR32,
497-
FeatureZCZeroingGPR64]>;
515+
FeatureZCZeroingGPR64,
516+
FeatureNoZCZeroingFPR64,
517+
FeatureZCZeroingFPR128]>;
498518

499519
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
500520
"Samsung Exynos-M3 processors",

llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
6969
; CHECK-LABEL: add_sub_su64:
7070
; CHECK: // %bb.0:
7171
; CHECK-NEXT: add d0, d1, d0
72-
; CHECK-NEXT: fmov d1, xzr
72+
; CHECK-NEXT: movi d1, #0000000000000000
7373
; CHECK-NEXT: sub d0, d1, d0
7474
; CHECK-NEXT: ret
7575
;
7676
; GENERIC-LABEL: add_sub_su64:
7777
; GENERIC: // %bb.0:
7878
; GENERIC-NEXT: add d0, d1, d0
79-
; GENERIC-NEXT: fmov d1, xzr
79+
; GENERIC-NEXT: movi d1, #0000000000000000
8080
; GENERIC-NEXT: sub d0, d1, d0
8181
; GENERIC-NEXT: ret
8282
%vecext = extractelement <2 x i64> %a, i32 0

0 commit comments

Comments
 (0)