Skip to content

Conversation

JoshdRod
Copy link
Contributor

@JoshdRod JoshdRod commented Oct 9, 2025

GlobalISel now selects usdot intrinsic, without falling back to SDAG.

@llvmbot
Copy link
Member

llvmbot commented Oct 9, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Joshua Rodriguez (JoshdRod)

Changes

GlobalISel now selects usdot intrinsic, without falling back to SDAG.


Full diff: https://github.com/llvm/llvm-project/pull/162615.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64InstrGISel.td (+7)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+2)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp (+4)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-matmul.ll (+62-27)
  • (modified) llvm/test/CodeGen/AArch64/arm64-vadd.ll (+1-4)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 7322212c5bb24..fe8419301b306 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_USDOT : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+  let hasSideEffects = 0;
+}
+
 // Generic instruction for the BSP pseudo. It is expanded into BSP, which
 // expands into BSL/BIT/BIF after register allocation.
 def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
 
 def : GINodeEquiv<G_UDOT, AArch64udot>;
 def : GINodeEquiv<G_SDOT, AArch64sdot>;
+def : GINodeEquiv<G_USDOT, AArch64usdot>;
 
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index ea2196a584127..acf80e1f582b6 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerTriOp(AArch64::G_UDOT);
   case Intrinsic::aarch64_neon_sdot:
     return LowerTriOp(AArch64::G_SDOT);
+  case Intrinsic::aarch64_neon_usdot:
+    return LowerTriOp(AArch64::G_USDOT);
   case Intrinsic::aarch64_neon_sqxtn:
     return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
   case Intrinsic::aarch64_neon_sqxtun:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index f90bcc7a77cdf..830a35bbeb494 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -590,6 +590,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
                                             unsigned Depth) const {
   switch (MI.getOpcode()) {
   case AArch64::G_DUP:
+  case AArch64::G_SADDLP:
+  case AArch64::G_UADDLP:
   case TargetOpcode::G_SITOFP:
   case TargetOpcode::G_UITOFP:
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -798,6 +800,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     if (Ty.isVector())
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else if (isPreISelGenericFloatingPointOpcode(Opc) ||
+             (MO.isDef() && onlyDefinesFP(MI, MRI, TRI)) ||
+             (MO.isUse() && onlyUsesFP(MI, MRI, TRI)) ||
              Ty.getSizeInBits() > 64)
       OpRegBankIdx[Idx] = PMI_FirstFPR;
     else
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index 649d0a9bfcab4..5329b2d1f2e3c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,41 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: smmla.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: smmla.v4i32.v16i8
-; CHECK: smmla   v0.4s, v1.16b, v2.16b
   %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
   ret <4 x i32> %vmmla1.i
 }
 
 define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: ummla.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ummla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: ummla.v4i32.v16i8
-; CHECK: ummla   v0.4s, v1.16b, v2.16b
   %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
   ret <4 x i32> %vmmla1.i
 }
 
 define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usmmla.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    usmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usmmla.v4i32.v16i8
-; CHECK: usmmla   v0.4s, v1.16b, v2.16b
   %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
   ret <4 x i32> %vusmmla1.i
 }
 
 define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot.v2i32.v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot.v2i32.v8i8
-; CHECK: usdot   v0.2s, v1.8b, v2.8b
   %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
   ret <2 x i32> %vusdot1.i
 }
 
 define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot_lane.v2i32.v8i8
-; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
   %0 = bitcast <8 x i8> %b to <2 x i32>
   %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
   %1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -44,9 +57,12 @@ entry:
 }
 
 define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: sudot_lane.v2i32.v8i8
-; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
   %0 = bitcast <8 x i8> %b to <2 x i32>
   %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
   %1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -55,9 +71,11 @@ entry:
 }
 
 define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot_lane.v2i32.v16i8
-; CHECK: usdot   v0.2s, v1.8b, v2.4b[0]
   %0 = bitcast <16 x i8> %b to <4 x i32>
   %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
   %1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -66,9 +84,11 @@ entry:
 }
 
 define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: sudot_lane.v2i32.v16i8
-; CHECK: sudot   v0.2s, v1.8b, v2.4b[0]
   %0 = bitcast <16 x i8> %b to <4 x i32>
   %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
   %1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -77,17 +97,22 @@ entry:
 }
 
 define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot.v4i32.v16i8
-; CHECK: usdot   v0.4s, v1.16b, v2.16b
   %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
   ret <4 x i32> %vusdot1.i
 }
 
 define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot_lane.v4i32.v16i8
-; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
   %0 = bitcast <8 x i8> %b to <2 x i32>
   %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
   %1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -96,9 +121,12 @@ entry:
 }
 
 define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: sudot_lane.v4i32.v16i8
-; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
   %0 = bitcast <8 x i8> %b to <2 x i32>
   %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
   %1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -107,9 +135,11 @@ entry:
 }
 
 define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: usdot_laneq.v4i32.v16i8
-; CHECK: usdot   v0.4s, v1.16b, v2.4b[0]
   %0 = bitcast <16 x i8> %b to <4 x i32>
   %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
   %1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -118,9 +148,11 @@ entry:
 }
 
 define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT:    ret
 entry:
-; CHECK-LABEL: sudot_laneq.v4i32.v16i8
-; CHECK: sudot   v0.4s, v1.16b, v2.4b[0]
   %0 = bitcast <16 x i8> %b to <4 x i32>
   %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
   %1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -134,3 +166,6 @@ declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <1
 declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 11fb73237da07..e3c80256feea0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,9 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for saddlp1d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for uaddlp1d
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: addhn8b:

@hazzlim hazzlim self-requested a review October 9, 2025 09:10
@JoshdRod JoshdRod force-pushed the add-usdot-intrinsic branch from eacff9e to 1ba9bc6 Compare October 9, 2025 10:15
@JoshdRod JoshdRod requested a review from hazzlim October 9, 2025 10:17
Copy link
Contributor

@hazzlim hazzlim left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good to me - thanks for the fix :)

@JoshdRod JoshdRod force-pushed the add-usdot-intrinsic branch 2 times, most recently from 7f07643 to ec75810 Compare October 14, 2025 09:04
Copy link
Contributor

@usha1830 usha1830 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM! Thanks.

@JoshdRod JoshdRod force-pushed the add-usdot-intrinsic branch from ec75810 to cb70a27 Compare October 15, 2025 15:43
GlobalISel now selects usdot intrinsic, without falling back to SDAG.
@JoshdRod JoshdRod force-pushed the add-usdot-intrinsic branch from cb70a27 to 00f161d Compare October 15, 2025 16:27
@usha1830 usha1830 merged commit ed8ad46 into llvm:main Oct 15, 2025
8 of 9 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants