[AArch64] Match constants in SelectSMETileSlice (#151494)

sdesmalen-arm · web-flow · commit 2ad1d77b17c3 · 2025-08-11T10:19:26.000+01:00
If the slice is a constant then it should try to use `WZR + &lt;imm&gt;`
addressing mode if the constant fits the range.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -7617,16 +7617,29 @@ bool AArch64DAGToDAGISel::SelectAnyPredicate(SDValue N) {
 bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
                                              SDValue &Base, SDValue &Offset,
                                              unsigned Scale) {
-  // Try to untangle an ADD node into a 'reg + offset'
-  if (CurDAG->isBaseWithConstantOffset(N))
-    if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+  auto MatchConstantOffset = [&](SDValue CN) -> SDValue {
+    if (auto *C = dyn_cast<ConstantSDNode>(CN)) {
       int64_t ImmOff = C->getSExtValue();
-      if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) {
-        Base = N.getOperand(0);
-        Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
-        return true;
-      }
+      if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0)))
+        return CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
     }
+    return SDValue();
+  };
+
+  if (SDValue C = MatchConstantOffset(N)) {
+    Base = CurDAG->getConstant(0, SDLoc(N), MVT::i32);
+    Offset = C;
+    return true;
+  }
+
+  // Try to untangle an ADD node into a 'reg + offset'
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (SDValue C = MatchConstantOffset(N.getOperand(1))) {
+      Base = N.getOperand(0);
+      Offset = C;
+      return true;
+    }
+  }
 
   // By default, just match reg + 0.
   Base = N;
diff --git a/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll b/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sme2 -enable-subreg-liveness < %s| FileCheck %s
+
+target triple = "aarch64"
+
+; The tile-slice addressing mode supports an immediate of 0-7.
+; This is testing an immediate of 0, 1, 7 (folded) and 8 (not folded).
+define void @sme_tileslice_addrmode_zero_base_plus_constant_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_zero_base_plus_constant_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w9, #8 // =0x8
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w9, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 1, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 7, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 8, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+; The tile-slice addressing mode supports an immediate of 0-7.
+; This is testing an immediate of 0, 1, 7 (folded) and 8 (not folded).
+define void @sme_tileslice_addrmode_base_plus_constant_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_base_plus_constant_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    add w9, w0, #8
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w9, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  %slice0 = add i32 %slice, 0
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice0, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice1 = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice1, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice7 = add i32 %slice, 7
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice7, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice8 = add i32 %slice, 8
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice8, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+define void @sme_tileslice_addrmode_base_plus_zero_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_base_plus_zero_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)