[AArch64][SelectionDAG] Enable new partial reduction lowering by default #143565

NickGuy-Arm · 2025-06-10T16:03:11Z

No description provided.

llvmbot · 2025-06-10T16:03:43Z

@llvm/pr-subscribers-backend-aarch64

Author: Nicholas Guy (NickGuy-Arm)

Changes

Patch is 193.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143565.diff

4 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+29-51)
(modified) llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll (+919-580)
(modified) llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll (+995-931)
(modified) llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll (+59-231)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 766599d567efd..e610a63598a18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,13 +153,6 @@ cl::opt<bool> EnableSVEGISel(
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
-// FIXME : This is a temporary flag, and is used to help transition to
-// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD
-// nodes.
-static cl::opt<bool> EnablePartialReduceNodes(
-    "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden,
-    cl::desc("Use the new method of lowering partial reductions."));
-
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -1457,7 +1450,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
 
-    if (EnablePartialReduceNodes && Subtarget->hasDotProd()) {
+    if (Subtarget->hasDotProd()) {
       static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
                                         ISD::PARTIAL_REDUCE_UMLA};
 
@@ -1895,14 +1888,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   }
 
   // Handle partial reduction operations
-  if (EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) {
+  if (Subtarget->isSVEorStreamingSVEAvailable()) {
     // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
     // Other pairs will default to 'Expand'.
     static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
                                       ISD::PARTIAL_REDUCE_UMLA};
     setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
     setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
-
     setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
 
     if (Subtarget->hasMatMulInt8()) {
@@ -1957,17 +1949,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
 
-      if (EnablePartialReduceNodes) {
-        static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                          ISD::PARTIAL_REDUCE_UMLA};
-        // Must be lowered to SVE instructions.
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
-      }
+      static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                        ISD::PARTIAL_REDUCE_UMLA};
+      // Must be lowered to SVE instructions.
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
     }
   }
 
@@ -2165,16 +2155,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   assert(I->getIntrinsicID() ==
              Intrinsic::experimental_vector_partial_reduce_add &&
          "Unexpected intrinsic!");
-  if (EnablePartialReduceNodes)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  auto Op1 = I->getOperand(1);
-  EVT Op1VT = EVT::getEVT(Op1->getType());
-  if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-      (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
-       VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
-    return false;
   return true;
 }
 
@@ -2252,26 +2232,24 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
   bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
 
-  if (EnablePartialReduceNodes) {
-    static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                      ISD::PARTIAL_REDUCE_UMLA};
-    unsigned NumElts = VT.getVectorNumElements();
-    if (VT.getVectorElementType() == MVT::i64) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i32) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i16) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
-    }
+  static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                    ISD::PARTIAL_REDUCE_UMLA};
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.getVectorElementType() == MVT::i64) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
+    setPartialReduceMLAAction(
+        MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
+    setPartialReduceMLAAction(
+        MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i32) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
+    setPartialReduceMLAAction(
+        MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i16) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
   }
 
   // Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0c7b3c7d3c138..43404d1f871fe 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1,15 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM
+; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefix=CHECK-DOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefix=CHECK-DOT-I8MM
 
 define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: udot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
@@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
-; CHECK-DOT-LABEL: udot_in_loop:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    mov x8, xzr
-; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
-; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
-; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-DOT-NEXT:    add x8, x8, #16
-; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
-; CHECK-DOT-NEXT:    cmp x8, #16
-; CHECK-DOT-NEXT:    b.ne .LBB1_1
-; CHECK-DOT-NEXT:  // %bb.2: // %end
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_in_loop:
 ; CHECK-NODOT:       // %bb.0: // %entry
 ; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
@@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NODOT-NEXT:    b.ne .LBB1_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -86,11 +106,6 @@ end:
 }
 
 define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: udot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
@@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: sdot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
@@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: sdot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
@@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: usdot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB6_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #000...
[truncated]

github-actions · 2025-06-10T16:07:31Z

✅ With the latest revision this PR passed the C/C++ code formatter.

SamTebbs33

LGTM.

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

llvm-ci · 2025-06-16T15:54:10Z

LLVM Buildbot has detected a new failure on builder mlir-nvidia running on mlir-nvidia while building llvm at step 7 "test-build-check-mlir-build-only-check-mlir".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/138/builds/14628

Here is the relevant piece of the build log for the reference

Step 7 (test-build-check-mlir-build-only-check-mlir) failure: test (failure)
******************** TEST 'MLIR :: Integration/GPU/CUDA/async.mlir' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -gpu-kernel-outlining  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary="format=fatbin"  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-runner    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_cuda_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_async_runtime.so    --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_runner_utils.so    --entry-point-result=void -O0  | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -gpu-kernel-outlining
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt '-pass-pipeline=builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)'
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -gpu-async-region -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary=format=fatbin
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -async-to-async-runtime -async-runtime-ref-counting
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-opt -convert-async-to-llvm -convert-func-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -reconcile-unrealized-casts
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/mlir-runner --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_cuda_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_async_runtime.so --shared-libs=/vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/lib/libmlir_runner_utils.so --entry-point-result=void -O0
# .---command stderr------------
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuStreamWaitEvent(stream, event, 0)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventSynchronize(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# | 'cuEventDestroy(event)' failed with 'CUDA_ERROR_CONTEXT_IS_DESTROYED'
# `-----------------------------
# executed command: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.obj/bin/FileCheck /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# .---command stderr------------
# | /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir:68:12: error: CHECK: expected string not found in input
# |  // CHECK: [84, 84]
# |            ^
# | <stdin>:1:1: note: scanning from here
# | Unranked Memref base@ = 0x57d37e4a9770 rank = 1 offset = 0 sizes = [2] strides = [1] data = 
# | ^
# | <stdin>:2:1: note: possible intended match here
# | [42, 42]
# | ^
# | 
# | Input file: <stdin>
# | Check file: /vol/worker/mlir-nvidia/mlir-nvidia/llvm.src/mlir/test/Integration/GPU/CUDA/async.mlir
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             1: Unranked Memref base@ = 0x57d37e4a9770 rank = 1 offset = 0 sizes = [2] strides = [1] data =  
# | check:68'0     X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |             2: [42, 42] 
# | check:68'0     ~~~~~~~~~
# | check:68'1     ?         possible intended match
...

david-arm · 2025-06-19T13:51:09Z

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

+; CHECK-COMMON-NEXT:    mov v0.16b, v2.16b
+; CHECK-COMMON-NEXT:    add x8, x8, #16
+; CHECK-COMMON-NEXT:    cmp x8, #16
+; CHECK-COMMON-NEXT:    tbl v7.16b, { v6.16b }, v3.16b


This is a serious regression. It used to generate udot before.

@MDevereau is looking into fixing this!

Opened #144907

…ult (llvm#143565)

NickGuy-Arm requested review from MacDue, SamTebbs33 and sdesmalen-arm June 10, 2025 16:03

llvmbot added the backend:AArch64 label Jun 10, 2025

SamTebbs33 approved these changes Jun 11, 2025

View reviewed changes

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll Outdated Show resolved Hide resolved

NickGuy-Arm added 2 commits June 16, 2025 14:19

[AArch64][SelectionDAG] Enable new partial reduction lowering by default

615174d

Update test

34819d9

NickGuy-Arm force-pushed the JamesChesterman/legal-partial-reduction/enable-default branch from a58925f to 34819d9 Compare June 16, 2025 13:38

clang-format

cd043b3

SamTebbs33 approved these changes Jun 16, 2025

View reviewed changes

NickGuy-Arm merged commit a5f0525 into llvm:main Jun 16, 2025
7 checks passed

david-arm reviewed Jun 19, 2025

View reviewed changes

akuhlens pushed a commit to akuhlens/llvm-project that referenced this pull request Jun 24, 2025

[AArch64][SelectionDAG] Enable new partial reduction lowering by defa…

405833c

…ult (llvm#143565)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64][SelectionDAG] Enable new partial reduction lowering by default #143565

[AArch64][SelectionDAG] Enable new partial reduction lowering by default #143565

Uh oh!

NickGuy-Arm commented Jun 10, 2025

Uh oh!

llvmbot commented Jun 10, 2025

Uh oh!

github-actions bot commented Jun 10, 2025 •

edited

Loading

Uh oh!

SamTebbs33 left a comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Jun 16, 2025

Uh oh!

david-arm Jun 19, 2025

Uh oh!

david-arm Jun 19, 2025

Uh oh!

MDevereau Jun 19, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

[AArch64][SelectionDAG] Enable new partial reduction lowering by default #143565

[AArch64][SelectionDAG] Enable new partial reduction lowering by default #143565

Uh oh!

Conversation

NickGuy-Arm commented Jun 10, 2025

Uh oh!

llvmbot commented Jun 10, 2025

Uh oh!

github-actions bot commented Jun 10, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

SamTebbs33 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

llvm-ci commented Jun 16, 2025

Uh oh!

david-arm Jun 19, 2025

Choose a reason for hiding this comment

Uh oh!

david-arm Jun 19, 2025

Choose a reason for hiding this comment

Uh oh!

MDevereau Jun 19, 2025

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

6 participants

github-actions bot commented Jun 10, 2025 •

edited

Loading