Skip to content

Commit c4bf428

Browse files
authored
Merge pull request #11793 from swiftlang/cherrypick-aarch64-zcm-zcz-optimizations
Cherrypick aarch64 zcm zcz optimizations
2 parents abf7631 + 8639949 commit c4bf428

22 files changed

+893
-644
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 70 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,7 @@ class AArch64AsmPrinter : public AsmPrinter {
315315

316316
/// Emit instruction to set float register to zero.
317317
void emitFMov0(const MachineInstr &MI);
318+
void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);
318319

319320
using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;
320321

@@ -1893,45 +1894,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18931894

18941895
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18951896
Register DestReg = MI.getOperand(0).getReg();
1896-
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
1897-
STI->isNeonAvailable()) {
1898-
// Convert H/S register to corresponding D register
1899-
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
1900-
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
1901-
else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
1902-
DestReg = AArch64::D0 + (DestReg - AArch64::S0);
1903-
else
1904-
assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
1897+
if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
1898+
if (STI->hasZeroCycleZeroingFPR64()) {
1899+
// Convert H/S register to corresponding D register
1900+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1901+
if (AArch64::FPR16RegClass.contains(DestReg))
1902+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1903+
&AArch64::FPR64RegClass);
1904+
else if (AArch64::FPR32RegClass.contains(DestReg))
1905+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1906+
&AArch64::FPR64RegClass);
1907+
else
1908+
assert(AArch64::FPR64RegClass.contains(DestReg));
1909+
1910+
MCInst MOVI;
1911+
MOVI.setOpcode(AArch64::MOVID);
1912+
MOVI.addOperand(MCOperand::createReg(DestReg));
1913+
MOVI.addOperand(MCOperand::createImm(0));
1914+
EmitToStreamer(*OutStreamer, MOVI);
1915+
} else if (STI->hasZeroCycleZeroingFPR128()) {
1916+
// Convert H/S/D register to corresponding Q register
1917+
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
1918+
if (AArch64::FPR16RegClass.contains(DestReg)) {
1919+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
1920+
&AArch64::FPR128RegClass);
1921+
} else if (AArch64::FPR32RegClass.contains(DestReg)) {
1922+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
1923+
&AArch64::FPR128RegClass);
1924+
} else {
1925+
assert(AArch64::FPR64RegClass.contains(DestReg));
1926+
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
1927+
&AArch64::FPR128RegClass);
1928+
}
19051929

1906-
MCInst MOVI;
1907-
MOVI.setOpcode(AArch64::MOVID);
1908-
MOVI.addOperand(MCOperand::createReg(DestReg));
1909-
MOVI.addOperand(MCOperand::createImm(0));
1910-
EmitToStreamer(*OutStreamer, MOVI);
1911-
} else {
1912-
MCInst FMov;
1913-
switch (MI.getOpcode()) {
1914-
default: llvm_unreachable("Unexpected opcode");
1915-
case AArch64::FMOVH0:
1916-
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1917-
if (!STI->hasFullFP16())
1918-
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1919-
FMov.addOperand(MCOperand::createReg(DestReg));
1920-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1921-
break;
1922-
case AArch64::FMOVS0:
1923-
FMov.setOpcode(AArch64::FMOVWSr);
1924-
FMov.addOperand(MCOperand::createReg(DestReg));
1925-
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1926-
break;
1927-
case AArch64::FMOVD0:
1928-
FMov.setOpcode(AArch64::FMOVXDr);
1929-
FMov.addOperand(MCOperand::createReg(DestReg));
1930-
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1931-
break;
1930+
MCInst MOVI;
1931+
MOVI.setOpcode(AArch64::MOVIv2d_ns);
1932+
MOVI.addOperand(MCOperand::createReg(DestReg));
1933+
MOVI.addOperand(MCOperand::createImm(0));
1934+
EmitToStreamer(*OutStreamer, MOVI);
1935+
} else {
1936+
emitFMov0AsFMov(MI, DestReg);
19321937
}
1933-
EmitToStreamer(*OutStreamer, FMov);
1938+
} else {
1939+
emitFMov0AsFMov(MI, DestReg);
1940+
}
1941+
}
1942+
1943+
void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
1944+
Register DestReg) {
1945+
MCInst FMov;
1946+
switch (MI.getOpcode()) {
1947+
default:
1948+
llvm_unreachable("Unexpected opcode");
1949+
case AArch64::FMOVH0:
1950+
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
1951+
if (!STI->hasFullFP16())
1952+
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
1953+
FMov.addOperand(MCOperand::createReg(DestReg));
1954+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1955+
break;
1956+
case AArch64::FMOVS0:
1957+
FMov.setOpcode(AArch64::FMOVWSr);
1958+
FMov.addOperand(MCOperand::createReg(DestReg));
1959+
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
1960+
break;
1961+
case AArch64::FMOVD0:
1962+
FMov.setOpcode(AArch64::FMOVXDr);
1963+
FMov.addOperand(MCOperand::createReg(DestReg));
1964+
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
1965+
break;
19341966
}
1967+
EmitToStreamer(*OutStreamer, FMov);
19351968
}
19361969

19371970
Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
@@ -2952,7 +2985,7 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
29522985
MCInst TmpInst;
29532986
TmpInst.setOpcode(AArch64::MOVIv16b_ns);
29542987
TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
2955-
TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
2988+
TmpInst.addOperand(MCOperand::createImm(0));
29562989
EmitToStreamer(*OutStreamer, TmpInst);
29572990
return;
29582991
}

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -621,25 +621,30 @@ def FeatureZCRegMoveGPR64 : SubtargetFeature<"zcm-gpr64", "HasZeroCycleRegMoveGP
621621
def FeatureZCRegMoveGPR32 : SubtargetFeature<"zcm-gpr32", "HasZeroCycleRegMoveGPR32", "true",
622622
"Has zero-cycle register moves for GPR32 registers">;
623623

624+
def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true",
625+
"Has zero-cycle register moves for FPR128 registers">;
626+
624627
def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
625628
"Has zero-cycle register moves for FPR64 registers">;
626629

627630
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
628631
"Has zero-cycle register moves for FPR32 registers">;
629632

630-
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
631-
"Has zero-cycle zeroing instructions for generic registers">;
633+
def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
634+
"Has zero-cycle zeroing instructions for GPR64 registers">;
635+
636+
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
637+
"Has zero-cycle zeroing instructions for GPR32 registers">;
638+
639+
def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
640+
"Has zero-cycle zeroing instructions for FPR128 registers">;
632641

633642
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
634643
// as movi is more efficient across all cores. Newer cores can eliminate
635644
// fmovs early and there is no difference with movi, but this is not true for
636645
// all implementations.
637-
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
638-
"Has no zero-cycle zeroing instructions for FP registers">;
639-
640-
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
641-
"Has zero-cycle zeroing instructions",
642-
[FeatureZCZeroingGP]>;
646+
def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
647+
"Has no zero-cycle zeroing instructions for FPR64 registers">;
643648

644649
/// ... but the floating-point version doesn't quite work in rare cases on older
645650
/// CPUs.

0 commit comments

Comments
 (0)