Skip to content

Commit 40952a5

Browse files
committed
[AArch64] Lower zero cycle FPR zeroing (llvm#156261)
Lower FPR64, FPR32, and FPR16 zeroing from `fmov` into NEON zeroing if the target supports zero-cycle zeroing of NEON registers but not of the narrower register classes. It handles two cases: the first in `AsmPrinter`, where an FP zeroing from an immediate has been captured by pattern matching during instruction selection, and the second post-RA in `AArch64InstrInfo::copyPhysReg`, for uncaptured or later-generated WZR/XZR fmovs. Adds a subtarget feature called FeatureZCZeroingFPR128 that makes it possible to query whether the target supports zero-cycle zeroing for FPR128 NEON registers, and updates the appropriate processors. (cherry-pick f059d2b)
1 parent 78e6120 commit 40952a5

15 files changed

+255
-177
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 69 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ class AArch64AsmPrinter : public AsmPrinter {
315315

316316
/// Emit instruction to set float register to zero.
317317
void emitFMov0(const MachineInstr &MI);
318+
void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
318319

319320
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
320321

@@ -1893,45 +1894,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18931894

18941895
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18951896
Register DestReg = MI.getOperand(0).getReg();
1896-
if (STI->hasZeroCycleZeroingFPR64() &&
1897-
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
1898-
// Convert H/S register to corresponding D register
1899-
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
1900-
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
1901-
else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
1902-
DestReg = AArch64::D0 + (DestReg - AArch64::S0);
1903-
else
1904-
assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
1897+
if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
1898+
if (STI->hasZeroCycleZeroingFPR64()) {
1899+
// Convert H/S register to corresponding D register
1900+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1901+
if (AArch64::FPR16RegClass.contains(DestReg))
1902+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1903+
&AArch64::FPR64RegClass);
1904+
else if (AArch64::FPR32RegClass.contains(DestReg))
1905+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1906+
&AArch64::FPR64RegClass);
1907+
else
1908+
assert(AArch64::FPR64RegClass.contains(DestReg));
1909+
1910+
MCInst MOVI;
1911+
MOVI.setOpcode(AArch64::MOVID);
1912+
MOVI.addOperand(MCOperand::createReg(DestReg));
1913+
MOVI.addOperand(MCOperand::createImm(0));
1914+
EmitToStreamer(*OutStreamer, MOVI);
1915+
} else if (STI->hasZeroCycleZeroingFPR128()) {
1916+
// Convert H/S/D register to corresponding Q register
1917+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1918+
if (AArch64::FPR16RegClass.contains(DestReg)) {
1919+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1920+
&AArch64::FPR128RegClass);
1921+
} else if (AArch64::FPR32RegClass.contains(DestReg)) {
1922+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1923+
&AArch64::FPR128RegClass);
1924+
} else {
1925+
assert(AArch64::FPR64RegClass.contains(DestReg));
1926+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
1927+
&AArch64::FPR128RegClass);
1928+
}
19051929

1906-
MCInst MOVI;
1907-
MOVI.setOpcode(AArch64::MOVID);
1908-
MOVI.addOperand(MCOperand::createReg(DestReg));
1909-
MOVI.addOperand(MCOperand::createImm(0));
1910-
EmitToStreamer(*OutStreamer, MOVI);
1911-
} else {
1912-
MCInst FMov;
1913-
switch (MI.getOpcode()) {
1914-
default: llvm_unreachable("Unexpected opcode");
1915-
case AArch64::FMOVH0:
1916-
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1917-
if (!STI->hasFullFP16())
1918-
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1919-
FMov.addOperand(MCOperand::createReg(DestReg));
1920-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1921-
break;
1922-
case AArch64::FMOVS0:
1923-
FMov.setOpcode(AArch64::FMOVWSr);
1924-
FMov.addOperand(MCOperand::createReg(DestReg));
1925-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1926-
break;
1927-
case AArch64::FMOVD0:
1928-
FMov.setOpcode(AArch64::FMOVXDr);
1929-
FMov.addOperand(MCOperand::createReg(DestReg));
1930-
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1931-
break;
1930+
MCInst MOVI;
1931+
MOVI.setOpcode(AArch64::MOVIv2d_ns);
1932+
MOVI.addOperand(MCOperand::createReg(DestReg));
1933+
MOVI.addOperand(MCOperand::createImm(0));
1934+
EmitToStreamer(*OutStreamer, MOVI);
1935+
} else {
1936+
emitFMov0AsFMov(MI, DestReg);
19321937
}
1933-
EmitToStreamer(*OutStreamer, FMov);
1938+
} else {
1939+
emitFMov0AsFMov(MI, DestReg);
1940+
}
1941+
}
1942+
1943+
void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
1944+
Register DestReg) {
1945+
MCInst FMov;
1946+
switch (MI.getOpcode()) {
1947+
default:
1948+
llvm_unreachable("Unexpected opcode");
1949+
case AArch64::FMOVH0:
1950+
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1951+
if (!STI->hasFullFP16())
1952+
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1953+
FMov.addOperand(MCOperand::createReg(DestReg));
1954+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1955+
break;
1956+
case AArch64::FMOVS0:
1957+
FMov.setOpcode(AArch64::FMOVWSr);
1958+
FMov.addOperand(MCOperand::createReg(DestReg));
1959+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1960+
break;
1961+
case AArch64::FMOVD0:
1962+
FMov.setOpcode(AArch64::FMOVXDr);
1963+
FMov.addOperand(MCOperand::createReg(DestReg));
1964+
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1965+
break;
19341966
}
1967+
EmitToStreamer(*OutStreamer, FMov);
19351968
}
19361969

19371970
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
636636
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
637637
"Has zero-cycle zeroing instructions for GPR32 registers">;
638638

639+
def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
640+
"Has zero-cycle zeroing instructions for FPR128 registers">;
641+
639642
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
640643
// as movi is more efficient across all cores. Newer cores can eliminate
641644
// fmovs early and there is no difference with movi, but this not true for

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5463,8 +5463,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
54635463
// Copies between GPR64 and FPR64.
54645464
if (AArch64::FPR64RegClass.contains(DestReg) &&
54655465
AArch64::GPR64RegClass.contains(SrcReg)) {
5466-
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5467-
.addReg(SrcReg, getKillRegState(KillSrc));
5466+
if (AArch64::XZR == SrcReg) {
5467+
BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
5468+
} else {
5469+
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5470+
.addReg(SrcReg, getKillRegState(KillSrc));
5471+
}
54685472
return;
54695473
}
54705474
if (AArch64::GPR64RegClass.contains(DestReg) &&
@@ -5476,8 +5480,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
54765480
// Copies between GPR32 and FPR32.
54775481
if (AArch64::FPR32RegClass.contains(DestReg) &&
54785482
AArch64::GPR32RegClass.contains(SrcReg)) {
5479-
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5480-
.addReg(SrcReg, getKillRegState(KillSrc));
5483+
if (AArch64::WZR == SrcReg) {
5484+
BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
5485+
} else {
5486+
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5487+
.addReg(SrcReg, getKillRegState(KillSrc));
5488+
}
54815489
return;
54825490
}
54835491
if (AArch64::GPR32RegClass.contains(DestReg) &&

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
315315
FeatureZCRegMoveFPR128,
316316
FeatureZCZeroingGPR32,
317317
FeatureZCZeroingGPR64,
318+
FeatureNoZCZeroingFPR64,
319+
FeatureZCZeroingFPR128,
318320
FeatureZCZeroingFPWorkaround]>;
319321

320322
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -329,7 +331,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
329331
FeatureZCRegMoveGPR64,
330332
FeatureZCRegMoveFPR128,
331333
FeatureZCZeroingGPR32,
332-
FeatureZCZeroingGPR64]>;
334+
FeatureZCZeroingGPR64,
335+
FeatureNoZCZeroingFPR64,
336+
FeatureZCZeroingFPR128]>;
333337

334338
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
335339
"Apple A11", [
@@ -343,7 +347,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
343347
FeatureZCRegMoveGPR64,
344348
FeatureZCRegMoveFPR128,
345349
FeatureZCZeroingGPR32,
346-
FeatureZCZeroingGPR64]>;
350+
FeatureZCZeroingGPR64,
351+
FeatureNoZCZeroingFPR64,
352+
FeatureZCZeroingFPR128]>;
347353

348354
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
349355
"Apple A12", [
@@ -357,7 +363,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
357363
FeatureZCRegMoveGPR64,
358364
FeatureZCRegMoveFPR128,
359365
FeatureZCZeroingGPR32,
360-
FeatureZCZeroingGPR64]>;
366+
FeatureZCZeroingGPR64,
367+
FeatureNoZCZeroingFPR64,
368+
FeatureZCZeroingFPR128]>;
361369

362370
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
363371
"Apple A13", [
@@ -371,7 +379,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
371379
FeatureZCRegMoveGPR64,
372380
FeatureZCRegMoveFPR128,
373381
FeatureZCZeroingGPR32,
374-
FeatureZCZeroingGPR64]>;
382+
FeatureZCZeroingGPR64,
383+
FeatureNoZCZeroingFPR64,
384+
FeatureZCZeroingFPR128]>;
375385

376386
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
377387
"Apple A14", [
@@ -390,7 +400,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
390400
FeatureZCRegMoveGPR64,
391401
FeatureZCRegMoveFPR128,
392402
FeatureZCZeroingGPR32,
393-
FeatureZCZeroingGPR64]>;
403+
FeatureZCZeroingGPR64,
404+
FeatureNoZCZeroingFPR64,
405+
FeatureZCZeroingFPR128]>;
394406

395407
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
396408
"Apple A15", [
@@ -409,7 +421,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
409421
FeatureZCRegMoveGPR64,
410422
FeatureZCRegMoveFPR128,
411423
FeatureZCZeroingGPR32,
412-
FeatureZCZeroingGPR64]>;
424+
FeatureZCZeroingGPR64,
425+
FeatureNoZCZeroingFPR64,
426+
FeatureZCZeroingFPR128]>;
413427

414428
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
415429
"Apple A16", [
@@ -428,7 +442,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
428442
FeatureZCRegMoveGPR64,
429443
FeatureZCRegMoveFPR128,
430444
FeatureZCZeroingGPR32,
431-
FeatureZCZeroingGPR64]>;
445+
FeatureZCZeroingGPR64,
446+
FeatureNoZCZeroingFPR64,
447+
FeatureZCZeroingFPR128]>;
432448

433449
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
434450
"Apple A17", [
@@ -447,7 +463,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
447463
FeatureZCRegMoveGPR64,
448464
FeatureZCRegMoveFPR128,
449465
FeatureZCZeroingGPR32,
450-
FeatureZCZeroingGPR64]>;
466+
FeatureZCZeroingGPR64,
467+
FeatureNoZCZeroingFPR64,
468+
FeatureZCZeroingFPR128]>;
451469

452470
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
453471
"Apple M4", [
@@ -465,7 +483,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
465483
FeatureZCRegMoveGPR64,
466484
FeatureZCRegMoveFPR128,
467485
FeatureZCZeroingGPR32,
468-
FeatureZCZeroingGPR64]>;
486+
FeatureZCZeroingGPR64,
487+
FeatureNoZCZeroingFPR64,
488+
FeatureZCZeroingFPR128]>;
469489

470490
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
471491
"Samsung Exynos-M3 processors",

llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
6969
; CHECK-LABEL: add_sub_su64:
7070
; CHECK: // %bb.0:
7171
; CHECK-NEXT: add d0, d1, d0
72-
; CHECK-NEXT: fmov d1, xzr
72+
; CHECK-NEXT: movi d1, #0000000000000000
7373
; CHECK-NEXT: sub d0, d1, d0
7474
; CHECK-NEXT: ret
7575
;
7676
; GENERIC-LABEL: add_sub_su64:
7777
; GENERIC: // %bb.0:
7878
; GENERIC-NEXT: add d0, d1, d0
79-
; GENERIC-NEXT: fmov d1, xzr
79+
; GENERIC-NEXT: movi d1, #0000000000000000
8080
; GENERIC-NEXT: sub d0, d1, d0
8181
; GENERIC-NEXT: ret
8282
%vecext = extractelement <2 x i64> %a, i32 0

0 commit comments

Comments
 (0)