[AArch64] Combine vector add(trunc(shift)) #169523
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This adds a combine for
  add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
->
  X = trunc(ashr(A, C)); add(X, lshr(X, BW-1))
The original converts into ashr+lshr+xtn+xtn+add. The second becomes ashr+xtn+usra. The first form has less total latency due to more parallelism, but more micro-ops and seems to be slower in practice.

Full diff: https://github.com/llvm/llvm-project/pull/169523.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..8026b5e542f27 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -105,6 +105,7 @@
#include <vector>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "aarch64-lower"
@@ -22595,6 +22596,37 @@ static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
Flags);
}
+// add(trunc(ashr(A, C)), trunc(lshr(B, BW-1))), with C >= BW
+// ->
+// X = trunc(ashr(A, C)); add(x, lshr(X, BW-1)
+// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
+// ashr+xtn+usra. The first form has less total latency due to more parallelism,
+// but more micro-ops and seems to be slower in practice.
+static SDValue performAddTrunkShiftCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2i32 && VT != MVT::v4i16 && VT != MVT::v8i8)
+ return SDValue();
+
+ SDValue AShr, LShr;
+ if (!sd_match(N, m_Add(m_Trunc(m_Value(AShr)), m_Trunc(m_Value(LShr)))))
+ return SDValue();
+ if (AShr.getOpcode() != AArch64ISD::VASHR)
+ std::swap(AShr, LShr);
+ if (AShr.getOpcode() != AArch64ISD::VASHR ||
+ LShr.getOpcode() != AArch64ISD::VLSHR ||
+ AShr.getOperand(0) != LShr.getOperand(0) ||
+ AShr.getConstantOperandVal(1) < VT.getScalarSizeInBits() ||
+ LShr.getConstantOperandVal(1) != VT.getScalarSizeInBits() * 2 - 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AShr);
+ SDValue Shift = DAG.getNode(
+ AArch64ISD::VLSHR, DL, VT, Trunc,
+ DAG.getTargetConstant(VT.getScalarSizeInBits() - 1, DL, MVT::i32));
+ return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
+}
+
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
// Try to change sum of two reductions.
@@ -22618,6 +22650,8 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
return Val;
+ if (SDValue Val = performAddTrunkShiftCombine(N, DCI.DAG))
+ return Val;
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
return Val;
@@ -28125,7 +28159,6 @@ static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performCTPOPCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
- using namespace llvm::SDPatternMatch;
if (!DCI.isBeforeLegalize())
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/addtruncshift.ll b/llvm/test/CodeGen/AArch64/addtruncshift.ll
new file mode 100644
index 0000000000000..6dbe0b3d80b9a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/addtruncshift.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-elf -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define <2 x i32> @test_v2i64(<2 x i64> %n) {
+; CHECK-SD-LABEL: test_v2i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #35
+; CHECK-SD-NEXT: xtn v0.2s, v0.2d
+; CHECK-SD-NEXT: usra v0.2s, v0.2s, #31
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.2d, v0.2d, #63
+; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #35
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: xtn v0.2s, v0.2d
+; CHECK-GI-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: ret
+entry:
+ %shr = lshr <2 x i64> %n, splat (i64 63)
+ %vmovn.i4 = trunc nuw nsw <2 x i64> %shr to <2 x i32>
+ %shr1 = ashr <2 x i64> %n, splat (i64 35)
+ %vmovn.i = trunc nsw <2 x i64> %shr1 to <2 x i32>
+ %add = add nsw <2 x i32> %vmovn.i4, %vmovn.i
+ ret <2 x i32> %add
+}
+
+define <4 x i16> @test_v4i32(<4 x i32> %n) {
+; CHECK-SD-LABEL: test_v4i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #17
+; CHECK-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-SD-NEXT: usra v0.4h, v0.4h, #15
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v4i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31
+; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #17
+; CHECK-GI-NEXT: xtn v1.4h, v1.4s
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: ret
+entry:
+ %shr = lshr <4 x i32> %n, splat (i32 31)
+ %vmovn.i4 = trunc nuw nsw <4 x i32> %shr to <4 x i16>
+ %shr1 = ashr <4 x i32> %n, splat (i32 17)
+ %vmovn.i = trunc nsw <4 x i32> %shr1 to <4 x i16>
+ %add = add nsw <4 x i16> %vmovn.i4, %vmovn.i
+ ret <4 x i16> %add
+}
+
+define <8 x i8> @test_v8i16(<8 x i16> %n) {
+; CHECK-SD-LABEL: test_v8i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #9
+; CHECK-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-SD-NEXT: usra v0.8b, v0.8b, #7
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15
+; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #9
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: add v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ret
+entry:
+ %shr = lshr <8 x i16> %n, splat (i16 15)
+ %vmovn.i4 = trunc nuw nsw <8 x i16> %shr to <8 x i8>
+ %shr1 = ashr <8 x i16> %n, splat (i16 9)
+ %vmovn.i = trunc nsw <8 x i16> %shr1 to <8 x i8>
+ %add = add nsw <8 x i8> %vmovn.i4, %vmovn.i
+ ret <8 x i8> %add
+}
+
+define <2 x i32> @test_v2i64_smallsrl(<2 x i64> %n) {
+; CHECK-LABEL: test_v2i64_smallsrl:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushr v1.2d, v0.2d, #62
+; CHECK-NEXT: sshr v0.2d, v0.2d, #35
+; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ret
+entry:
+ %shr = lshr <2 x i64> %n, splat (i64 62)
+ %vmovn.i4 = trunc nuw nsw <2 x i64> %shr to <2 x i32>
+ %shr1 = ashr <2 x i64> %n, splat (i64 35)
+ %vmovn.i = trunc nsw <2 x i64> %shr1 to <2 x i32>
+ %add = add nsw <2 x i32> %vmovn.i4, %vmovn.i
+ ret <2 x i32> %add
+}
+
+define <2 x i32> @test_v2i64_smallsra(<2 x i64> %n) {
+; CHECK-LABEL: test_v2i64_smallsra:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ushr v1.2d, v0.2d, #63
+; CHECK-NEXT: shrn v0.2s, v0.2d, #27
+; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ret
+entry:
+ %shr = lshr <2 x i64> %n, splat (i64 63)
+ %vmovn.i4 = trunc nuw nsw <2 x i64> %shr to <2 x i32>
+ %shr1 = ashr <2 x i64> %n, splat (i64 27)
+ %vmovn.i = trunc nsw <2 x i64> %shr1 to <2 x i32>
+ %add = add nsw <2 x i32> %vmovn.i4, %vmovn.i
+ ret <2 x i32> %add
+}
+
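As an aside (a minimal sketch, not part of the patch; the value set, BW = 32 and C = 35 are illustrative), the scalar identity the combine relies on can be checked directly: with C >= BW, bit BW-1 of trunc(ashr(A, C)) is A's sign bit, i.e. the same bit that trunc(lshr(A, 2*BW-1)) produces, so shifting the truncated value is enough.

```c++
// Standalone sanity check of the scalar identity behind the combine
// (illustrative only, not part of the patch). Assumes arithmetic right
// shift of signed values, as clang/gcc provide and C++20 guarantees.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {INT64_MIN, -12345678901LL, -1, 0, 1,
                          12345678901LL, INT64_MAX};
  const unsigned BW = 32; // narrow element width
  const unsigned C = 35;  // ashr amount; the combine requires C >= BW
  for (int64_t A : Vals) {
    // Original form: add(trunc(ashr(A, C)), trunc(lshr(A, 2*BW-1))).
    uint32_t Orig =
        (uint32_t)(A >> C) + (uint32_t)((uint64_t)A >> (2 * BW - 1));
    // Combined form: X = trunc(ashr(A, C)); add(X, lshr(X, BW-1)).
    uint32_t X = (uint32_t)(A >> C);
    uint32_t Folded = X + (X >> (BW - 1));
    // With C >= BW, bit BW-1 of X is A's sign bit, so both forms agree.
    assert(Orig == Folded);
  }
  return 0;
}
```

This is also why the test_v2i64_smallsrl and test_v2i64_smallsra cases above stay un-combined: with a smaller lshr amount or with C < BW, the extracted bit is no longer the sign bit and the fold would be incorrect.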
🐧 Linux x64 Test Results

All tests passed but another part of the build failed. Click on a failure below to see the details.

lib/Target/AArch64/CMakeFiles/LLVMAArch64CodeGen.dir/AArch64ISelLowering.cpp.o

If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the
SamTebbs33
left a comment
LGTM
      Flags);
}

// add(trunc(ashr(A, C)), trunc(lshr(B, BW-1))), with C >= BW
Should B here be A? That seems to be what the code suggests.
usha1830
left a comment
LGTM, Thanks.
// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
// ashr+xtn+usra. The first form has less total latency due to more parallelism,
// but more micro-ops and seems to be slower in practice.
static SDValue performAddTrunkShiftCombine(SDNode *N, SelectionDAG &DAG) {
Suggested change:
- static SDValue performAddTrunkShiftCombine(SDNode *N, SelectionDAG &DAG) {
+ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
Force-pushed from 1faa946 to 9ea52af
Force-pushed from 9ea52af to d01e4cc
This adds a combine for
add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
->
X = trunc(ashr(A, C)); add(X, lshr(X, BW-1))
The original converts into ashr+lshr+xtn+xtn+add. The second becomes
ashr+xtn+usra. The first form has less total latency due to more parallelism,
but more micro-ops and seems to be slower in practice.