Skip to content

Commit 0eb0a65

Browse files
[AArch64] Correctly determine if {ADD,SUB}{W,X}rs instructions are cheap
These instructions are marked "as cheap as a move". According to publicly available Software Optimization Guides, they have one-cycle latency and maximum throughput only on some microarchitectures, only for `LSL`, and only for some shift amounts. This patch uses the subtarget feature `FeatureALULSLFast` to determine how cheap the instructions are.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D152827

Change-Id: I8f0d7e79bcf277ebf959719991c29a1bc7829486
1 parent d2b71c7 commit 0eb0a65

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,13 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904904
switch (MI.getOpcode()) {
905905
default:
906906
return MI.isAsCheapAsAMove();
907+
908+
case AArch64::ADDWrs:
909+
case AArch64::ADDXrs:
910+
case AArch64::SUBWrs:
911+
case AArch64::SUBXrs:
912+
return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
913+
907914
// If MOVi32imm or MOVi64imm can be expanded into ORRWri or
908915
// ORRXri, it is as cheap as MOV.
909916
// Likewise if it can be expanded to MOVZ/MOVN/MOVK.
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc < %s -o - | FileCheck %s
3+
; RUN: llc -mattr=+alu-lsl-fast < %s -o - | FileCheck %s -check-prefix=LSLFAST
4+
target triple = "aarch64-linux"
5+
6+
declare void @g(...)
7+
8+
; Check that ADDWrs/ADDXrs with shift > 4 is considered relatively
9+
; slow, thus CSE-d.
10+
define void @f0(i1 %c0, i1 %c1, ptr %a, i64 %i) {
11+
; CHECK-LABEL: f0:
12+
; CHECK: // %bb.0: // %E
13+
; CHECK-NEXT: tbz w0, #0, .LBB0_5
14+
; CHECK-NEXT: // %bb.1: // %A
15+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
16+
; CHECK-NEXT: .cfi_def_cfa_offset 16
17+
; CHECK-NEXT: .cfi_offset w30, -16
18+
; CHECK-NEXT: add x0, x2, x3, lsl #5
19+
; CHECK-NEXT: tbz w1, #0, .LBB0_3
20+
; CHECK-NEXT: // %bb.2: // %B
21+
; CHECK-NEXT: bl g
22+
; CHECK-NEXT: b .LBB0_4
23+
; CHECK-NEXT: .LBB0_3: // %C
24+
; CHECK-NEXT: mov x1, x0
25+
; CHECK-NEXT: bl g
26+
; CHECK-NEXT: .LBB0_4:
27+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
28+
; CHECK-NEXT: .LBB0_5: // %X
29+
; CHECK-NEXT: ret
30+
;
31+
; LSLFAST-LABEL: f0:
32+
; LSLFAST: // %bb.0: // %E
33+
; LSLFAST-NEXT: tbz w0, #0, .LBB0_5
34+
; LSLFAST-NEXT: // %bb.1: // %A
35+
; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
36+
; LSLFAST-NEXT: .cfi_def_cfa_offset 16
37+
; LSLFAST-NEXT: .cfi_offset w30, -16
38+
; LSLFAST-NEXT: add x0, x2, x3, lsl #5
39+
; LSLFAST-NEXT: tbz w1, #0, .LBB0_3
40+
; LSLFAST-NEXT: // %bb.2: // %B
41+
; LSLFAST-NEXT: bl g
42+
; LSLFAST-NEXT: b .LBB0_4
43+
; LSLFAST-NEXT: .LBB0_3: // %C
44+
; LSLFAST-NEXT: mov x1, x0
45+
; LSLFAST-NEXT: bl g
46+
; LSLFAST-NEXT: .LBB0_4:
47+
; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
48+
; LSLFAST-NEXT: .LBB0_5: // %X
49+
; LSLFAST-NEXT: ret
50+
; The GEP below scales %i by 32 bytes (4 x i64), which selects to an ADD with
; lsl #5. A shift amount of 5 is above the <= 4 "cheap" limit, so the
; recomputation in %C is CSE-d under both check configurations (note the single
; add in the CHECK and LSLFAST lines above).
E:
51+
%p0 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
52+
br i1 %c0, label %A, label %X
53+
54+
A:
55+
br i1 %c1, label %B, label %C
56+
57+
B:
58+
call void @g(ptr %p0)
59+
br label %X
60+
61+
; %p1 recomputes the same address as %p0 from %E; it is expected to be CSE-d
; with %p0 (hence the "mov x1, x0" rather than a second add in the checks).
C:
62+
%p1 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
63+
call void @g(ptr %p1, ptr %p0)
64+
br label %X
65+
66+
X:
67+
ret void
68+
}
69+
70+
; Check that ADDWrs/ADDXrs with shift <= 4 is considered relatively fast on sub-targets
71+
; with feature +alu-lsl-fast, thus *not* CSE-d.
72+
define void @f1(i1 %c0, i1 %c1, ptr %a, i64 %i) {
73+
; CHECK-LABEL: f1:
74+
; CHECK: // %bb.0: // %E
75+
; CHECK-NEXT: tbz w0, #0, .LBB1_5
76+
; CHECK-NEXT: // %bb.1: // %A
77+
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
78+
; CHECK-NEXT: .cfi_def_cfa_offset 16
79+
; CHECK-NEXT: .cfi_offset w30, -16
80+
; CHECK-NEXT: add x0, x2, x3, lsl #4
81+
; CHECK-NEXT: tbz w1, #0, .LBB1_3
82+
; CHECK-NEXT: // %bb.2: // %B
83+
; CHECK-NEXT: bl g
84+
; CHECK-NEXT: b .LBB1_4
85+
; CHECK-NEXT: .LBB1_3: // %C
86+
; CHECK-NEXT: mov x1, x0
87+
; CHECK-NEXT: bl g
88+
; CHECK-NEXT: .LBB1_4:
89+
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
90+
; CHECK-NEXT: .LBB1_5: // %X
91+
; CHECK-NEXT: ret
92+
;
93+
; LSLFAST-LABEL: f1:
94+
; LSLFAST: // %bb.0: // %E
95+
; LSLFAST-NEXT: tbz w0, #0, .LBB1_5
96+
; LSLFAST-NEXT: // %bb.1: // %A
97+
; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
98+
; LSLFAST-NEXT: .cfi_def_cfa_offset 16
99+
; LSLFAST-NEXT: .cfi_offset w30, -16
100+
; LSLFAST-NEXT: add x8, x2, x3, lsl #4
101+
; LSLFAST-NEXT: tbz w1, #0, .LBB1_3
102+
; LSLFAST-NEXT: // %bb.2: // %B
103+
; LSLFAST-NEXT: mov x0, x8
104+
; LSLFAST-NEXT: bl g
105+
; LSLFAST-NEXT: b .LBB1_4
106+
; LSLFAST-NEXT: .LBB1_3: // %C
107+
; LSLFAST-NEXT: add x0, x2, x3, lsl #4
108+
; LSLFAST-NEXT: mov x1, x8
109+
; LSLFAST-NEXT: bl g
110+
; LSLFAST-NEXT: .LBB1_4:
111+
; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
112+
; LSLFAST-NEXT: .LBB1_5: // %X
113+
; LSLFAST-NEXT: ret
114+
; Here the GEP scales %i by 16 bytes (2 x i64), i.e. an ADD with lsl #4, which
; is within the <= 4 "cheap" shift limit. With +alu-lsl-fast the add is
; considered as cheap as a move, so the recomputation in %C is kept (two adds
; in the LSLFAST lines above) instead of being CSE-d (one add in CHECK).
E:
115+
%p0 = getelementptr {i64, i64}, ptr %a, i64 %i
116+
br i1 %c0, label %A, label %X
117+
118+
A:
119+
br i1 %c1, label %B, label %C
120+
121+
B:
122+
call void @g(ptr %p0)
123+
br label %X
124+
125+
; %p1 is the same address as %p0; under +alu-lsl-fast it is expected to stay
; as a separate add rather than being CSE-d with %p0.
C:
126+
%p1 = getelementptr {i64, i64}, ptr %a, i64 %i
127+
call void @g(ptr %p1, ptr %p0)
128+
br label %X
129+
130+
X:
131+
ret void
132+
}

0 commit comments

Comments
 (0)