Skip to content

Commit f52550a

Browse files
davemgreen丹治秀樹
authored andcommitted
[AArch64] Combine vector add(trunc(shift)) (llvm#169523)
This adds a combine for add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW -> X = trunc(ashr(A, C)); add(X, lshr(X, BW-1)). The original converts into ashr+lshr+xtn+xtn+add. The second becomes ashr+xtn+usra. The first form has less total latency due to more parallelism, but more micro-ops and seems to be slower in practice.
1 parent 836a319 commit f52550a

File tree

2 files changed

+79
-27
lines changed

2 files changed

+79
-27
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22586,6 +22586,38 @@ static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
2258622586
Flags);
2258722587
}
2258822588

22589+
// add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
22590+
// ->
22591+
// X = trunc(ashr(A, C)); add(X, lshr(X, BW-1))
22592+
// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
22593+
// ashr+xtn+usra. The first form has less total latency due to more parallelism,
22594+
// but more micro-ops and seems to be slower in practice.
22595+
// Returns the replacement ADD node built on the narrow type, or an empty
// SDValue when N does not match the pattern checked below.
static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
22596+
using namespace llvm::SDPatternMatch;
22597+
// Only v2i32, v4i16 and v8i8 result types are considered.
EVT VT = N->getValueType(0);
22598+
if (VT != MVT::v2i32 && VT != MVT::v4i16 && VT != MVT::v8i8)
22599+
return SDValue();
22600+
22601+
// N must be an add of two truncates; capture the truncated operands.
SDValue AShr, LShr;
22602+
if (!sd_match(N, m_Add(m_Trunc(m_Value(AShr)), m_Trunc(m_Value(LShr)))))
22603+
return SDValue();
22604+
// The operands may have been captured in either order; swap so that AShr
// holds the VASHR candidate before validating both.
if (AShr.getOpcode() != AArch64ISD::VASHR)
22605+
std::swap(AShr, LShr);
22606+
// Require: a VASHR and a VLSHR of the same source value, an arithmetic shift
// amount of at least the narrow element width, and a logical shift amount of
// exactly twice the narrow element width minus one.
// NOTE(review): 2*BW-1 is presumably the sign-bit index of the wide element,
// i.e. the source elements are assumed to be twice as wide as VT's -- confirm.
if (AShr.getOpcode() != AArch64ISD::VASHR ||
22607+
LShr.getOpcode() != AArch64ISD::VLSHR ||
22608+
AShr.getOperand(0) != LShr.getOperand(0) ||
22609+
AShr.getConstantOperandVal(1) < VT.getScalarSizeInBits() ||
22610+
LShr.getConstantOperandVal(1) != VT.getScalarSizeInBits() * 2 - 1)
22611+
return SDValue();
22612+
22613+
// Truncate the arithmetic shift once, then take the top bit of each narrow
// element (logical shift right by narrow width - 1) and add it back in.
SDLoc DL(N);
22614+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AShr);
22615+
SDValue Shift = DAG.getNode(
22616+
AArch64ISD::VLSHR, DL, VT, Trunc,
22617+
DAG.getTargetConstant(VT.getScalarSizeInBits() - 1, DL, MVT::i32));
22618+
return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
22619+
}
22620+
2258922621
static SDValue performAddSubCombine(SDNode *N,
2259022622
TargetLowering::DAGCombinerInfo &DCI) {
2259122623
// Try to change sum of two reductions.
@@ -22609,6 +22641,8 @@ static SDValue performAddSubCombine(SDNode *N,
2260922641
return Val;
2261022642
if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
2261122643
return Val;
22644+
if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
22645+
return Val;
2261222646

2261322647
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
2261422648
return Val;

llvm/test/CodeGen/AArch64/addtruncshift.ll

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,21 @@
33
; RUN: llc -mtriple=aarch64-none-elf -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
define <2 x i32> @test_v2i64(<2 x i64> %n) {
6-
; CHECK-LABEL: test_v2i64:
7-
; CHECK: // %bb.0: // %entry
8-
; CHECK-NEXT: ushr v1.2d, v0.2d, #63
9-
; CHECK-NEXT: sshr v0.2d, v0.2d, #35
10-
; CHECK-NEXT: xtn v1.2s, v1.2d
11-
; CHECK-NEXT: xtn v0.2s, v0.2d
12-
; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
13-
; CHECK-NEXT: ret
6+
; CHECK-SD-LABEL: test_v2i64:
7+
; CHECK-SD: // %bb.0: // %entry
8+
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #35
9+
; CHECK-SD-NEXT: xtn v0.2s, v0.2d
10+
; CHECK-SD-NEXT: usra v0.2s, v0.2s, #31
11+
; CHECK-SD-NEXT: ret
12+
;
13+
; CHECK-GI-LABEL: test_v2i64:
14+
; CHECK-GI: // %bb.0: // %entry
15+
; CHECK-GI-NEXT: ushr v1.2d, v0.2d, #63
16+
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #35
17+
; CHECK-GI-NEXT: xtn v1.2s, v1.2d
18+
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
19+
; CHECK-GI-NEXT: add v0.2s, v1.2s, v0.2s
20+
; CHECK-GI-NEXT: ret
1421
entry:
1522
%shr = lshr <2 x i64> %n, splat (i64 63)
1623
%vmovn.i4 = trunc nuw nsw <2 x i64> %shr to <2 x i32>
@@ -21,14 +28,21 @@ entry:
2128
}
2229

2330
define <4 x i16> @test_v4i32(<4 x i32> %n) {
24-
; CHECK-LABEL: test_v4i32:
25-
; CHECK: // %bb.0: // %entry
26-
; CHECK-NEXT: ushr v1.4s, v0.4s, #31
27-
; CHECK-NEXT: sshr v0.4s, v0.4s, #17
28-
; CHECK-NEXT: xtn v1.4h, v1.4s
29-
; CHECK-NEXT: xtn v0.4h, v0.4s
30-
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
31-
; CHECK-NEXT: ret
31+
; CHECK-SD-LABEL: test_v4i32:
32+
; CHECK-SD: // %bb.0: // %entry
33+
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #17
34+
; CHECK-SD-NEXT: xtn v0.4h, v0.4s
35+
; CHECK-SD-NEXT: usra v0.4h, v0.4h, #15
36+
; CHECK-SD-NEXT: ret
37+
;
38+
; CHECK-GI-LABEL: test_v4i32:
39+
; CHECK-GI: // %bb.0: // %entry
40+
; CHECK-GI-NEXT: ushr v1.4s, v0.4s, #31
41+
; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #17
42+
; CHECK-GI-NEXT: xtn v1.4h, v1.4s
43+
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
44+
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
45+
; CHECK-GI-NEXT: ret
3246
entry:
3347
%shr = lshr <4 x i32> %n, splat (i32 31)
3448
%vmovn.i4 = trunc nuw nsw <4 x i32> %shr to <4 x i16>
@@ -39,14 +53,21 @@ entry:
3953
}
4054

4155
define <8 x i8> @test_v8i16(<8 x i16> %n) {
42-
; CHECK-LABEL: test_v8i16:
43-
; CHECK: // %bb.0: // %entry
44-
; CHECK-NEXT: ushr v1.8h, v0.8h, #15
45-
; CHECK-NEXT: sshr v0.8h, v0.8h, #9
46-
; CHECK-NEXT: xtn v1.8b, v1.8h
47-
; CHECK-NEXT: xtn v0.8b, v0.8h
48-
; CHECK-NEXT: add v0.8b, v1.8b, v0.8b
49-
; CHECK-NEXT: ret
56+
; CHECK-SD-LABEL: test_v8i16:
57+
; CHECK-SD: // %bb.0: // %entry
58+
; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #9
59+
; CHECK-SD-NEXT: xtn v0.8b, v0.8h
60+
; CHECK-SD-NEXT: usra v0.8b, v0.8b, #7
61+
; CHECK-SD-NEXT: ret
62+
;
63+
; CHECK-GI-LABEL: test_v8i16:
64+
; CHECK-GI: // %bb.0: // %entry
65+
; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #15
66+
; CHECK-GI-NEXT: sshr v0.8h, v0.8h, #9
67+
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
68+
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
69+
; CHECK-GI-NEXT: add v0.8b, v1.8b, v0.8b
70+
; CHECK-GI-NEXT: ret
5071
entry:
5172
%shr = lshr <8 x i16> %n, splat (i16 15)
5273
%vmovn.i4 = trunc nuw nsw <8 x i16> %shr to <8 x i8>
@@ -91,6 +112,3 @@ entry:
91112
ret <2 x i32> %add
92113
}
93114

94-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
95-
; CHECK-GI: {{.*}}
96-
; CHECK-SD: {{.*}}

0 commit comments

Comments
 (0)