diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index a46983fd37ea9..14f556c1f8c23 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1007,6 +1007,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) { return Is.size() <= 2; } +// Check if a COPY instruction is cheap. +static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) { + assert(MI.isCopy() && "Expected COPY instruction"); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64, + // typically requiring an FMOV instruction with a 2-6 cycle latency. + auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * { + if (Reg.isVirtual()) + return MRI.getRegClass(Reg); + if (Reg.isPhysical()) + return RI.getMinimalPhysRegClass(Reg); + return nullptr; + }; + const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg()); + if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC)) + return false; + + return MI.isAsCheapAsAMove(); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. 
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { @@ -1020,6 +1042,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { default: return MI.isAsCheapAsAMove(); + case TargetOpcode::COPY: + return isCheapCopy(MI, RI); + case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::SUBWrs: diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir new file mode 100644 index 0000000000000..6a10df68ddc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s + +# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR) +# are hoisted out of loops by MachineLICM, as they are expensive on AArch64. + +--- | + declare void @use_float(float) + declare void @use_int(i32) + + define void @gpr_to_fpr_virtual_copy_hoisted() { + ret void + } + + define void @gpr_to_fpr_physical_copy_hoisted() { + ret void + } + + define void @fpr_to_gpr_virtual_copy_hoisted() { + ret void + } +... 
+---
+name: gpr_to_fpr_virtual_copy_hoisted
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $w0, $w1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]]
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
+  ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: B %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $s0 = COPY [[COPY4]]
+  ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+  ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
+  ; CHECK-NEXT: B %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT: RET_ReallyLR
+  bb.0:
+    liveins: $w0, $w1
+    %1:gpr32 = COPY $w0
+    %0:gpr32 = COPY $w1
+    %3:gpr32all = COPY $wzr
+    %2:gpr32all = COPY %3:gpr32all
+
+  bb.1:
+    %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
+    %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %7:fpr32 = COPY %0:gpr32 ; GPR-to-FPR copy of a value defined in bb.0; the expected output above has it in bb.0
+    $s0 = COPY %7:fpr32
+    BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+    %8:gpr32sp = ADDWri %4:gpr32common, 1, 0
+    %5:gpr32all = COPY %8:gpr32sp
+    B %bb.1 ; branches back to bb.1, so bb.1 and bb.2 form the loop
+
+  bb.3:
+    RET_ReallyLR
+
+...
+---
+name: gpr_to_fpr_physical_copy_hoisted
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $w0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2
+  ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: B %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $s0 = COPY [[COPY3]]
+  ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+  ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
+  ; CHECK-NEXT: B %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT: RET_ReallyLR
+  bb.0:
+    liveins: $w0
+    %1:gpr32 = COPY $w0
+    %3:gpr32all = COPY $wzr
+    %2:gpr32all = COPY %3:gpr32all
+
+  bb.1:
+    %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
+    %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %7:fpr32 = COPY $wzr ; copy from the physical register $wzr into an fpr32 vreg; the expected output above has it in bb.0
+    $s0 = COPY %7:fpr32
+    BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+    %8:gpr32sp = ADDWri %4:gpr32common, 1, 0
+    %5:gpr32all = COPY %8:gpr32sp
+    B %bb.1 ; branches back to bb.1, so bb.1 and bb.2 form the loop
+
+  bb.3:
+    RET_ReallyLR
+
+...
+---
+name: fpr_to_gpr_virtual_copy_hoisted
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $w0, $s0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]]
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
+  ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: B %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: $w0 = COPY [[COPY4]]
+  ; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp
+  ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
+  ; CHECK-NEXT: B %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT: RET_ReallyLR
+  bb.0:
+    liveins: $w0, $s0
+    %1:gpr32 = COPY $w0
+    %0:fpr32 = COPY $s0
+    %3:gpr32all = COPY $wzr
+    %2:gpr32all = COPY %3:gpr32all
+
+  bb.1:
+    %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
+    %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %7:gpr32 = COPY %0:fpr32 ; FPR-to-GPR copy of a value defined in bb.0; the expected output above has it in bb.0
+    $w0 = COPY %7:gpr32
+    BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp
+    %8:gpr32sp = ADDWri %4:gpr32common, 1, 0
+    %5:gpr32all = COPY %8:gpr32sp
+    B %bb.1 ; branches back to bb.1, so bb.1 and bb.2 form the loop
+
+  bb.3:
+    RET_ReallyLR
+
+...