
Commit ededcb0

[AArch64] Refactor AArch64InstrInfo::isAsCheapAsAMove (NFC)
- Remove `FeatureCustomCheapAsMoveHandling`: when target features that affect `isAsCheapAsAMove` can be given on the command line or passed via attributes, every sub-target effectively has custom handling.
- Remove the special handling of `FMOVD0` etc.: an `FMOV` with an immediate zero operand is never[1] more expensive than an `FMOV` with a register operand.
- Remove the special handling of `COPY`: a copy is trivially as cheap as itself.
- Make the function default to the `MachineInstr` attribute `isAsCheapAsAMove`.
- Remove the special handling of `ANDWrr` etc. and of `ANDWri` etc.: the fallback `MachineInstr` attribute is already non-zero.
- Remove the special handling of `ADDWri`/`SUBWri`/`ADDXri`/`SUBXri`: these always[1] have one-cycle latency with maximum (for the micro-architecture) throughput.
- Check whether `MOVi32Imm`/`MOVi64Imm` can be expanded into a "cheap" sequence of instructions.

There is a little twist to determining whether a `MOVi32Imm`/`MOVi64Imm` is "as cheap as a move". Even if one of these pseudo-instructions needs to be expanded to more than one MOVZ, MOVN, or MOVK instruction, materialisation may still be preferable to allocating a register to hold the constant. For the moment a cutoff at two instructions seems like a reasonable compromise.

[1] according to 19 software optimisation manuals

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D154722
1 parent b4301df commit ededcb0
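To make the "at most two instructions" cutoff concrete, here is a minimal standalone sketch. It is an illustration only: `roughMovImmCount` is a hypothetical helper, not the in-tree `AArch64_IMM::expandMOVImm`, which also knows about MOVN, ORR-encodable bitmask immediates and other tricks and can therefore only do better than this chunk count. The two constants are the ones used in the new test further down.

#include <cstdint>
#include <cstdio>

// Rough model only: count how many MOVZ/MOVK instructions a plain
// chunk-by-chunk materialisation of a 64-bit immediate would take.
static unsigned roughMovImmCount(uint64_t Imm) {
  unsigned Count = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16)
    if ((Imm >> Shift) & 0xffff)
      ++Count; // one MOVZ for the first non-zero chunk, one MOVK per extra chunk
  return Count ? Count : 1; // zero still needs one instruction (or a zero register)
}

int main() {
  printf("%u\n", roughMovImmCount(0x1234000056780001)); // 3 -> not "cheap", hoisted in @f0
  printf("%u\n", roughMovImmCount(0x1234000056780000)); // 2 -> "cheap", rematerialised in @f1
  return 0;
}

With the cutoff at two, only the second constant is treated as cheap as a move, which is exactly the difference the new test below checks for.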

File tree

3 files changed: +136 -83 lines changed

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 1 addition & 12 deletions
@@ -223,14 +223,9 @@ def FeatureEnableSelectOptimize : SubtargetFeature<
     "enable-select-opt", "EnableSelectOptimize", "true",
     "Enable the select optimize pass for select loop heuristics">;
 
-def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
-    "HasCustomCheapAsMoveHandling", "true",
-    "Use custom handling of cheap instructions">;
-
 def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move",
     "HasExynosCheapAsMoveHandling", "true",
-    "Use Exynos specific handling of cheap instructions",
-    [FeatureCustomCheapAsMoveHandling]>;
+    "Use Exynos specific handling of cheap instructions">;
 
 def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
@@ -794,7 +789,6 @@ def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
     FeatureFuseAES,
     FeatureFuseAdrpAdd,
     FeatureBalanceFPOps,
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler]>;
 
 def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55",
@@ -815,7 +809,6 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
     "Cortex-A57 ARM processors", [
     FeatureFuseAES,
     FeatureBalanceFPOps,
-    FeatureCustomCheapAsMoveHandling,
     FeatureFuseAdrpAdd,
     FeatureFuseLiterals,
     FeaturePostRAScheduler,
@@ -1110,7 +1103,6 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
 
 def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
     "Qualcomm Kryo processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1120,7 +1112,6 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
 
 def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
     "Qualcomm Falkor processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1188,7 +1179,6 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
 
 def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
     "Qualcomm Saphira processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeaturePostRAScheduler,
     FeaturePredictableSelectIsExpensive,
     FeatureZCZeroing,
@@ -1237,7 +1227,6 @@ def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
 
 def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110",
     "HiSilicon TS-V110 processors", [
-    FeatureCustomCheapAsMoveHandling,
     FeatureFuseAES,
     FeaturePostRAScheduler]>;
 
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 18 additions & 71 deletions
@@ -878,93 +878,40 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
       .addImm(CC);
 }
 
-/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
-static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
-  uint64_t Imm = MI.getOperand(1).getImm();
-  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
-  uint64_t Encoding;
-  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
+// Return true if Imm can be loaded into a register by a "cheap" sequence of
+// instructions. For now, "cheap" means at most two instructions.
+static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
+  if (BitSize == 32)
+    return true;
+
+  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
+  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
+  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
+
+  return Is.size() <= 2;
 }
 
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
-  if (!Subtarget.hasCustomCheapAsMoveHandling())
-    return MI.isAsCheapAsAMove();
-
-  const unsigned Opcode = MI.getOpcode();
-
-  // Firstly, check cases gated by features.
-
-  if (Subtarget.hasZeroCycleZeroingFP()) {
-    if (Opcode == AArch64::FMOVH0 ||
-        Opcode == AArch64::FMOVS0 ||
-        Opcode == AArch64::FMOVD0)
-      return true;
-  }
-
-  if (Subtarget.hasZeroCycleZeroingGP()) {
-    if (Opcode == TargetOpcode::COPY &&
-        (MI.getOperand(1).getReg() == AArch64::WZR ||
-         MI.getOperand(1).getReg() == AArch64::XZR))
-      return true;
-  }
-
-  // Secondly, check cases specific to sub-targets.
-
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
     if (isExynosCheapAsMove(MI))
       return true;
-
     return MI.isAsCheapAsAMove();
   }
 
-  // Finally, check generic cases.
-
-  switch (Opcode) {
+  switch (MI.getOpcode()) {
   default:
-    return false;
-
-  // add/sub on register without shift
-  case AArch64::ADDWri:
-  case AArch64::ADDXri:
-  case AArch64::SUBWri:
-  case AArch64::SUBXri:
-    return (MI.getOperand(3).getImm() == 0);
-
-  // logical ops on immediate
-  case AArch64::ANDWri:
-  case AArch64::ANDXri:
-  case AArch64::EORWri:
-  case AArch64::EORXri:
-  case AArch64::ORRWri:
-  case AArch64::ORRXri:
-    return true;
-
-  // logical ops on register without shift
-  case AArch64::ANDWrr:
-  case AArch64::ANDXrr:
-  case AArch64::BICWrr:
-  case AArch64::BICXrr:
-  case AArch64::EONWrr:
-  case AArch64::EONXrr:
-  case AArch64::EORWrr:
-  case AArch64::EORXrr:
-  case AArch64::ORNWrr:
-  case AArch64::ORNXrr:
-  case AArch64::ORRWrr:
-  case AArch64::ORRXrr:
-    return true;
-
+    return MI.isAsCheapAsAMove();
   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
-  // ORRXri, it is as cheap as MOV
+  // ORRXri, it is as cheap as MOV.
+  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
   case AArch64::MOVi32imm:
-    return canBeExpandedToORR(MI, 32);
+    return isCheapImmediate(MI, 32);
   case AArch64::MOVi64imm:
-    return canBeExpandedToORR(MI, 64);
+    return isCheapImmediate(MI, 64);
   }
-
-  llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
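For context on how the hook is consumed, here is a hedged sketch, not the in-tree pass code: the helper name and the simplified logic are illustrative assumptions. Loop passes such as MachineLICM query `TargetInstrInfo::isAsCheapAsAMove` when weighing whether an invariant instruction is worth hoisting and holding in a register across a loop.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Illustrative helper only (not LLVM API): if an instruction is as cheap as a
// move, rematerialising it at each use is fine, so tying up a register for the
// whole loop is usually not worth it.
static bool worthHoistingOutOfLoop(const MachineInstr &MI,
                                   const TargetInstrInfo &TII) {
  if (TII.isAsCheapAsAMove(MI))
    return false; // e.g. a MOVi64imm that expands to at most two instructions
  return true;    // expensive to recompute; real passes also weigh register pressure
}

With this change a MOVi64imm of 0x1234000056780001 (three MOVZ/MOVK instructions) is not reported as cheap and remains a hoisting candidate, while 0x1234000056780000 (two instructions) is reported cheap and can simply be re-materialised, matching the behaviour the new test below pins down.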
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux"
+
+; Check an "expensive" construction of a constant is hoisted out of a loop
+define void @f0(ptr %a, i64 %n) {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w23, -40
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x21, #1 // =0x1
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    movk x21, #22136, lsl #16
+; CHECK-NEXT:    mov x22, xzr
+; CHECK-NEXT:    movk x21, #4660, lsl #48
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.ge .LBB0_2
+; CHECK-NEXT:  .LBB0_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x23, x22, #2
+; CHECK-NEXT:    mov x1, x21
+; CHECK-NEXT:    ldr w0, [x20, x23]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    str w0, [x20, x23]
+; CHECK-NEXT:    add x22, x22, #1
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    b.lt .LBB0_1
+; CHECK-NEXT:  .LBB0_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361857) ; 0x1234000056780001
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+; Check a "cheap" to construct constant is materialised inside a loop.
+define void @f1(ptr %a, i64 %n) {
+; CHECK-LABEL: f1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w30, -48
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x21, xzr
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.ge .LBB1_2
+; CHECK-NEXT:  .LBB1_1: // %loop.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lsl x22, x21, #2
+; CHECK-NEXT:    mov x1, #1450704896 // =0x56780000
+; CHECK-NEXT:    movk x1, #4660, lsl #48
+; CHECK-NEXT:    ldr w0, [x20, x22]
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    str w0, [x20, x22]
+; CHECK-NEXT:    add x21, x21, #1
+; CHECK-NEXT:    cmp x21, x19
+; CHECK-NEXT:    b.lt .LBB1_1
+; CHECK-NEXT:  .LBB1_2: // %exit
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop.body]
+  %c = icmp slt i64 %i, %n
+  br i1 %c, label %loop.body, label %exit
+
+loop.body:
+  %p = getelementptr i32, ptr %a, i64 %i
+  %v = load i32, ptr %p
+  %w = call i32 @g(i32 %v, i64 1311673392922361856) ; 0x1234000056780000
+  store i32 %w, ptr %p
+  %i.next = add i64 %i, 1
+  br label %loop
+
+exit:
+  ret void
+}
+
+declare i32 @g(i32, i64)
