[AArch64] Consider COPY between disjoint register classes as expensive

guy-david · guy-david · commit 3f257e596f30 · 2025-11-13T00:25:40.000+02:00
The motivation is to allow passes such as MachineLICM to hoist trivial
FMOV instructions out of loops. Previously, such a move was not hoisted
even when its RHS was a constant.
On most architectures, these expensive move instructions have a latency
of 2-6 cycles, and are certainly not as cheap as a 0-1 cycle move.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1043,6 +1043,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
   return Is.size() <= 2;
 }
 
+// Returns whether COPY MI is as cheap as a move; disjoint-class copies are not.
+static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
+  assert(MI.isCopy() && "Expected COPY instruction");
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+
+  // Copies between disjoint register classes (e.g., GPR <-> FPR) are expensive
+  // on AArch64, typically requiring an FMOV with a 2-6 cycle latency.
+  auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
+    if (Reg.isVirtual())
+      return MRI.getRegClass(Reg);
+    if (Reg.isPhysical())
+      return RI.getMinimalPhysRegClass(Reg);
+    return nullptr; // Neither virtual nor physical: no class to compare.
+  };
+  const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
+  const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
+  if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
+    return false; // Both classes known and they share no subclass.
+
+  return MI.isAsCheapAsAMove();
+}
+
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
@@ -1056,6 +1078,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   default:
     return MI.isAsCheapAsAMove();
 
+  case TargetOpcode::COPY:
+    return isCheapCopy(MI, RI);
+
   case AArch64::ADDWrs:
   case AArch64::ADDXrs:
   case AArch64::SUBWrs:
diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir
@@ -0,0 +1,76 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s
+
+# Verifies that copies between disjoint register classes (e.g., GPR to FPR) are
+# hoisted out of loops by MachineLICM, as they are expensive on AArch64.
+
+--- |
+  declare void @use_float(float)
+
+  define void @cross_regclass_copy_hoisted() {
+    ret void
+  }
+...
+---
+name: cross_regclass_copy_hoisted
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: cross_regclass_copy_hoisted
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $w1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY $w0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2
+  ; CHECK-NEXT:   [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   $s0 = COPY [[COPY4]]
+  ; CHECK-NEXT:   BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-NEXT:   [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]]
+  ; CHECK-NEXT:   B %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    liveins: $w0, $w1
+    %1:gpr32 = COPY $w0
+    %0:gpr32 = COPY $w1
+    %3:gpr32all = COPY $wzr
+    %2:gpr32all = COPY %3:gpr32all
+
+  bb.1:
+    %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2
+    %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv
+    Bcc 1, %bb.3, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    %7:fpr32 = COPY %0:gpr32
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $s0 = COPY %7:fpr32
+    BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    %8:gpr32sp = ADDWri %4:gpr32common, 1, 0
+    %5:gpr32all = COPY %8:gpr32sp
+    B %bb.1
+
+  bb.3:
+    RET_ReallyLR
+
+...