Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 69 additions & 36 deletions llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ class AArch64AsmPrinter : public AsmPrinter {

/// Emit instruction to set float register to zero.
void emitFMov0(const MachineInstr &MI);
void emitFMov0AsFMov(const MachineInstr &MI, Register DestReg);

using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>;

Expand Down Expand Up @@ -1829,45 +1830,77 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {

void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFPR64() &&
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
// Convert H/S register to corresponding D register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31)
DestReg = AArch64::D0 + (DestReg - AArch64::S0);
else
assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31);
if (!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
if (STI->hasZeroCycleZeroingFPR64()) {
// Convert H/S register to corresponding D register
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
if (AArch64::FPR16RegClass.contains(DestReg))
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR64RegClass);
else if (AArch64::FPR32RegClass.contains(DestReg))
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR64RegClass);
else
assert(AArch64::FPR64RegClass.contains(DestReg));

MCInst MOVI;
MOVI.setOpcode(AArch64::MOVID);
MOVI.addOperand(MCOperand::createReg(DestReg));
MOVI.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, MOVI);
} else if (STI->hasZeroCycleZeroingFPR128()) {
// Convert H/S/D register to corresponding Q register
const AArch64RegisterInfo *TRI = STI->getRegisterInfo();
if (AArch64::FPR16RegClass.contains(DestReg)) {
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR128RegClass);
} else if (AArch64::FPR32RegClass.contains(DestReg)) {
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR128RegClass);
} else {
assert(AArch64::FPR64RegClass.contains(DestReg));
DestReg = TRI->getMatchingSuperReg(DestReg, AArch64::dsub,
&AArch64::FPR128RegClass);
}

MCInst MOVI;
MOVI.setOpcode(AArch64::MOVID);
MOVI.addOperand(MCOperand::createReg(DestReg));
MOVI.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, MOVI);
} else {
MCInst FMov;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case AArch64::FMOVH0:
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
if (!STI->hasFullFP16())
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
break;
case AArch64::FMOVS0:
FMov.setOpcode(AArch64::FMOVWSr);
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
break;
case AArch64::FMOVD0:
FMov.setOpcode(AArch64::FMOVXDr);
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
break;
MCInst MOVI;
MOVI.setOpcode(AArch64::MOVIv2d_ns);
MOVI.addOperand(MCOperand::createReg(DestReg));
MOVI.addOperand(MCOperand::createImm(0));
EmitToStreamer(*OutStreamer, MOVI);
} else {
emitFMov0AsFMov(MI, DestReg);
}
EmitToStreamer(*OutStreamer, FMov);
} else {
emitFMov0AsFMov(MI, DestReg);
}
}

void AArch64AsmPrinter::emitFMov0AsFMov(const MachineInstr &MI,
Register DestReg) {
MCInst FMov;
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
case AArch64::FMOVH0:
FMov.setOpcode(STI->hasFullFP16() ? AArch64::FMOVWHr : AArch64::FMOVWSr);
if (!STI->hasFullFP16())
DestReg = (AArch64::S0 + (DestReg - AArch64::H0));
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
break;
case AArch64::FMOVS0:
FMov.setOpcode(AArch64::FMOVWSr);
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::WZR));
break;
case AArch64::FMOVD0:
FMov.setOpcode(AArch64::FMOVXDr);
FMov.addOperand(MCOperand::createReg(DestReg));
FMov.addOperand(MCOperand::createReg(AArch64::XZR));
break;
}
EmitToStreamer(*OutStreamer, FMov);
}

Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc,
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Features.td
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,9 @@ def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGP
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
"Has zero-cycle zeroing instructions for GPR32 registers">;

// Subtarget feature "zcz-fpr128": sets HasZeroCycleZeroingFPR128 to true.
// AArch64AsmPrinter::emitFMov0 checks hasZeroCycleZeroingFPR128() when the
// FPR64 zero-cycle-zeroing feature is not set.
def FeatureZCZeroingFPR128 : SubtargetFeature<"zcz-fpr128", "HasZeroCycleZeroingFPR128", "true",
"Has zero-cycle zeroing instructions for FPR128 registers">;

// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this is not true for
Expand Down
16 changes: 12 additions & 4 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5471,8 +5471,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR64 and FPR64.
if (AArch64::FPR64RegClass.contains(DestReg) &&
AArch64::GPR64RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::XZR == SrcReg) {
BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::GPR64RegClass.contains(DestReg) &&
Expand All @@ -5484,8 +5488,12 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copies between GPR32 and FPR32.
if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::GPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (AArch64::WZR == SrcReg) {
BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (AArch64::GPR32RegClass.contains(DestReg) &&
Expand Down
38 changes: 29 additions & 9 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128,
FeatureZCZeroingFPWorkaround]>;

def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
Expand All @@ -358,7 +360,9 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
Expand All @@ -372,7 +376,9 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
Expand All @@ -386,7 +392,9 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
Expand All @@ -400,7 +408,9 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
Expand All @@ -419,7 +429,9 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
Expand All @@ -438,7 +450,9 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
Expand All @@ -457,7 +471,9 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
Expand All @@ -476,7 +492,9 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
Expand All @@ -494,7 +512,9 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
FeatureZCZeroingFPR128]>;

def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,14 @@ define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
; CHECK-NEXT: add d0, d1, d0
; CHECK-NEXT: fmov d1, xzr
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: sub d0, d1, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
; GENERIC-NEXT: add d0, d1, d0
; GENERIC-NEXT: fmov d1, xzr
; GENERIC-NEXT: movi d1, #0000000000000000
; GENERIC-NEXT: sub d0, d1, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
Expand Down
Loading