Skip to content

Commit 5019d80

Browse files
author
git apple-llvm automerger
committed
Merge commit 'de4b458aa5e5' from llvm.org/main into next
2 parents a1721d4 + de4b458 commit 5019d80

File tree

3 files changed

+340
-2
lines changed

3 files changed

+340
-2
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
11431143
ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
11441144
ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
11451145
ISD::STORE, ISD::BUILD_VECTOR});
1146+
setTargetDAGCombine(ISD::SMIN);
11461147
setTargetDAGCombine(ISD::TRUNCATE);
11471148
setTargetDAGCombine(ISD::LOAD);
11481149

@@ -2392,6 +2393,15 @@ static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
23922393
return false;
23932394
}
23942395

2396+
bool isVectorizedBinOp(unsigned Opcode) {
2397+
switch (Opcode) {
2398+
case AArch64ISD::SQDMULH:
2399+
return true;
2400+
default:
2401+
return false;
2402+
}
2403+
}
2404+
23952405
// isOpcWithIntImmediate - This method tests to see if the node is a specific
23962406
// opcode and that it has a immediate integer right operand.
23972407
// If so Imm will receive the value.
@@ -20165,8 +20175,9 @@ static SDValue performConcatVectorsCombine(SDNode *N,
2016520175
// size, combine into a binop of two concats of the source vectors. e.g.:
2016620176
// concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
2016720177
if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20168-
DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
20169-
N1->hasOneUse()) {
20178+
(DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20179+
isVectorizedBinOp(N0Opc)) &&
20180+
N0->hasOneUse() && N1->hasOneUse()) {
2017020181
SDValue N00 = N0->getOperand(0);
2017120182
SDValue N01 = N0->getOperand(1);
2017220183
SDValue N10 = N1->getOperand(0);
@@ -21025,6 +21036,98 @@ static SDValue performBuildVectorCombine(SDNode *N,
2102521036
return SDValue();
2102621037
}
2102721038

21039+
// A special combine for the sqdmulh family of instructions.
21040+
// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ),
21041+
// SATURATING_VAL ) can be reduced to sqdmulh(...)
21042+
static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
21043+
21044+
if (N->getOpcode() != ISD::SMIN)
21045+
return SDValue();
21046+
21047+
EVT DestVT = N->getValueType(0);
21048+
21049+
if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
21050+
DestVT.isScalableVector())
21051+
return SDValue();
21052+
21053+
ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
21054+
21055+
if (!Clamp)
21056+
return SDValue();
21057+
21058+
MVT ScalarType;
21059+
unsigned ShiftAmt = 0;
21060+
switch (Clamp->getSExtValue()) {
21061+
case (1ULL << 15) - 1:
21062+
ScalarType = MVT::i16;
21063+
ShiftAmt = 16;
21064+
break;
21065+
case (1ULL << 31) - 1:
21066+
ScalarType = MVT::i32;
21067+
ShiftAmt = 32;
21068+
break;
21069+
default:
21070+
return SDValue();
21071+
}
21072+
21073+
SDValue Sra = N->getOperand(0);
21074+
if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
21075+
return SDValue();
21076+
21077+
ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
21078+
if (!RightShiftVec)
21079+
return SDValue();
21080+
unsigned SExtValue = RightShiftVec->getSExtValue();
21081+
21082+
if (SExtValue != (ShiftAmt - 1))
21083+
return SDValue();
21084+
21085+
SDValue Mul = Sra.getOperand(0);
21086+
if (Mul.getOpcode() != ISD::MUL)
21087+
return SDValue();
21088+
21089+
SDValue SExt0 = Mul.getOperand(0);
21090+
SDValue SExt1 = Mul.getOperand(1);
21091+
21092+
if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
21093+
SExt1.getOpcode() != ISD::SIGN_EXTEND)
21094+
return SDValue();
21095+
21096+
EVT SExt0Type = SExt0.getOperand(0).getValueType();
21097+
EVT SExt1Type = SExt1.getOperand(0).getValueType();
21098+
21099+
if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
21100+
SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
21101+
SExt0Type.getVectorNumElements() == 1)
21102+
return SDValue();
21103+
21104+
SDLoc DL(N);
21105+
SDValue V0 = SExt0.getOperand(0);
21106+
SDValue V1 = SExt1.getOperand(0);
21107+
21108+
// Ensure input vectors are extended to legal types
21109+
if (SExt0Type.getFixedSizeInBits() < 64) {
21110+
unsigned VecNumElements = SExt0Type.getVectorNumElements();
21111+
EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
21112+
VecNumElements);
21113+
V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
21114+
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
21115+
}
21116+
21117+
SDValue SQDMULH =
21118+
DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
21119+
21120+
return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
21121+
}
21122+
21123+
// Target DAG combine for ISD::SMIN nodes. The only fold attempted is the
// SQDMULH pattern match; its result is forwarded as-is, so an empty SDValue
// is returned whenever that pattern does not apply.
static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
  return trySQDMULHCombine(N, DAG);
}
21130+
2102821131
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
2102921132
TargetLowering::DAGCombinerInfo &DCI) {
2103021133
SDLoc DL(N);
@@ -26776,6 +26879,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2677626879
return performAddSubCombine(N, DCI);
2677726880
case ISD::BUILD_VECTOR:
2677826881
return performBuildVectorCombine(N, DCI, DAG);
26882+
case ISD::SMIN:
26883+
return performSMINCombine(N, DAG);
2677926884
case ISD::TRUNCATE:
2678026885
return performTruncateCombine(N, DAG, DCI);
2678126886
case AArch64ISD::ANDS:

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,6 +1024,7 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
10241024
[SDNPCommutative]>;
10251025
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
10261026
[SDNPCommutative]>;
1027+
// Signed saturating doubling multiply returning high half. The operation is
// symmetric in its two operands, so it is marked commutative like the
// AArch64smull / AArch64umull nodes above.
def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull,
                            [SDNPCommutative]>;
10271028

10281029
// Reciprocal estimates and steps.
10291030
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
@@ -9451,6 +9452,15 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
94519452
(EXTRACT_SUBREG V128:$Rm, dsub)),
94529453
(UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>;
94539454

9455+
// Select the AArch64sqdmulh node to the matching SQDMULH vector instruction.
// One pattern per supported type: v4i16 and v2i32 use V64 register operands,
// v8i16 and v4i32 use V128 register operands.
def : Pat<(v4i16 (AArch64sqdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
9456+
(SQDMULHv4i16 V64:$Rn, V64:$Rm)>;
9457+
def : Pat<(v2i32 (AArch64sqdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
9458+
(SQDMULHv2i32 V64:$Rn, V64:$Rm)>;
9459+
def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))),
9460+
(SQDMULHv8i16 V128:$Rn, V128:$Rm)>;
9461+
def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))),
9462+
(SQDMULHv4i32 V128:$Rn, V128:$Rm)>;
9463+
94549464
// Conversions within AdvSIMD types in the same register size are free.
94559465
// But because we need a consistent lane ordering, in big endian many
94569466
// conversions require one or more REV instructions.
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s
3+
4+
5+
define <2 x i16> @saturating_2xi16(<2 x i16> %a, <2 x i16> %b) {
6+
; CHECK-LABEL: saturating_2xi16:
7+
; CHECK: // %bb.0:
8+
; CHECK-NEXT: shl v0.2s, v0.2s, #16
9+
; CHECK-NEXT: shl v1.2s, v1.2s, #16
10+
; CHECK-NEXT: sshr v0.2s, v0.2s, #16
11+
; CHECK-NEXT: sshr v1.2s, v1.2s, #16
12+
; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s
13+
; CHECK-NEXT: ret
14+
%as = sext <2 x i16> %a to <2 x i32>
15+
%bs = sext <2 x i16> %b to <2 x i32>
16+
%m = mul <2 x i32> %bs, %as
17+
%sh = ashr <2 x i32> %m, splat (i32 15)
18+
%ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767))
19+
%t = trunc <2 x i32> %ma to <2 x i16>
20+
ret <2 x i16> %t
21+
}
22+
23+
define <4 x i16> @saturating_4xi16(<4 x i16> %a, <4 x i16> %b) {
24+
; CHECK-LABEL: saturating_4xi16:
25+
; CHECK: // %bb.0:
26+
; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h
27+
; CHECK-NEXT: ret
28+
%as = sext <4 x i16> %a to <4 x i32>
29+
%bs = sext <4 x i16> %b to <4 x i32>
30+
%m = mul <4 x i32> %bs, %as
31+
%sh = ashr <4 x i32> %m, splat (i32 15)
32+
%ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767))
33+
%t = trunc <4 x i32> %ma to <4 x i16>
34+
ret <4 x i16> %t
35+
}
36+
37+
define <8 x i16> @saturating_8xi16(<8 x i16> %a, <8 x i16> %b) {
38+
; CHECK-LABEL: saturating_8xi16:
39+
; CHECK: // %bb.0:
40+
; CHECK-NEXT: sqdmulh v0.8h, v1.8h, v0.8h
41+
; CHECK-NEXT: ret
42+
%as = sext <8 x i16> %a to <8 x i32>
43+
%bs = sext <8 x i16> %b to <8 x i32>
44+
%m = mul <8 x i32> %bs, %as
45+
%sh = ashr <8 x i32> %m, splat (i32 15)
46+
%ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 32767))
47+
%t = trunc <8 x i32> %ma to <8 x i16>
48+
ret <8 x i16> %t
49+
}
50+
51+
define <2 x i32> @saturating_2xi32(<2 x i32> %a, <2 x i32> %b) {
52+
; CHECK-LABEL: saturating_2xi32:
53+
; CHECK: // %bb.0:
54+
; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s
55+
; CHECK-NEXT: ret
56+
%as = sext <2 x i32> %a to <2 x i64>
57+
%bs = sext <2 x i32> %b to <2 x i64>
58+
%m = mul <2 x i64> %bs, %as
59+
%sh = ashr <2 x i64> %m, splat (i64 31)
60+
%ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647))
61+
%t = trunc <2 x i64> %ma to <2 x i32>
62+
ret <2 x i32> %t
63+
}
64+
65+
define <4 x i32> @saturating_4xi32(<4 x i32> %a, <4 x i32> %b) {
66+
; CHECK-LABEL: saturating_4xi32:
67+
; CHECK: // %bb.0:
68+
; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s
69+
; CHECK-NEXT: ret
70+
%as = sext <4 x i32> %a to <4 x i64>
71+
%bs = sext <4 x i32> %b to <4 x i64>
72+
%m = mul <4 x i64> %bs, %as
73+
%sh = ashr <4 x i64> %m, splat (i64 31)
74+
%ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 2147483647))
75+
%t = trunc <4 x i64> %ma to <4 x i32>
76+
ret <4 x i32> %t
77+
}
78+
79+
define <8 x i32> @saturating_8xi32(<8 x i32> %a, <8 x i32> %b) {
80+
; CHECK-LABEL: saturating_8xi32:
81+
; CHECK: // %bb.0:
82+
; CHECK-NEXT: sqdmulh v1.4s, v3.4s, v1.4s
83+
; CHECK-NEXT: sqdmulh v0.4s, v2.4s, v0.4s
84+
; CHECK-NEXT: ret
85+
%as = sext <8 x i32> %a to <8 x i64>
86+
%bs = sext <8 x i32> %b to <8 x i64>
87+
%m = mul <8 x i64> %bs, %as
88+
%sh = ashr <8 x i64> %m, splat (i64 31)
89+
%ma = tail call <8 x i64> @llvm.smin.v8i64(<8 x i64> %sh, <8 x i64> splat (i64 2147483647))
90+
%t = trunc <8 x i64> %ma to <8 x i32>
91+
ret <8 x i32> %t
92+
}
93+
94+
define <2 x i64> @saturating_2xi32_2xi64(<2 x i32> %a, <2 x i32> %b) {
95+
; CHECK-LABEL: saturating_2xi32_2xi64:
96+
; CHECK: // %bb.0:
97+
; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s
98+
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
99+
; CHECK-NEXT: ret
100+
%as = sext <2 x i32> %a to <2 x i64>
101+
%bs = sext <2 x i32> %b to <2 x i64>
102+
%m = mul <2 x i64> %bs, %as
103+
%sh = ashr <2 x i64> %m, splat (i64 31)
104+
%ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647))
105+
ret <2 x i64> %ma
106+
}
107+
108+
define <6 x i16> @saturating_6xi16(<6 x i16> %a, <6 x i16> %b) {
109+
; CHECK-LABEL: saturating_6xi16:
110+
; CHECK: // %bb.0:
111+
; CHECK-NEXT: smull2 v3.4s, v1.8h, v0.8h
112+
; CHECK-NEXT: movi v2.4s, #127, msl #8
113+
; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h
114+
; CHECK-NEXT: sshr v3.4s, v3.4s, #15
115+
; CHECK-NEXT: smin v2.4s, v3.4s, v2.4s
116+
; CHECK-NEXT: xtn2 v0.8h, v2.4s
117+
; CHECK-NEXT: ret
118+
%as = sext <6 x i16> %a to <6 x i32>
119+
%bs = sext <6 x i16> %b to <6 x i32>
120+
%m = mul <6 x i32> %bs, %as
121+
%sh = ashr <6 x i32> %m, splat (i32 15)
122+
%ma = tail call <6 x i32> @llvm.smin.v6i32(<6 x i32> %sh, <6 x i32> splat (i32 32767))
123+
%t = trunc <6 x i32> %ma to <6 x i16>
124+
ret <6 x i16> %t
125+
}
126+
127+
define <4 x i16> @unsupported_saturation_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
128+
; CHECK-LABEL: unsupported_saturation_value_v4i16:
129+
; CHECK: // %bb.0:
130+
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
131+
; CHECK-NEXT: movi v1.4s, #42
132+
; CHECK-NEXT: sshr v0.4s, v0.4s, #15
133+
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
134+
; CHECK-NEXT: xtn v0.4h, v0.4s
135+
; CHECK-NEXT: ret
136+
%as = sext <4 x i16> %a to <4 x i32>
137+
%bs = sext <4 x i16> %b to <4 x i32>
138+
%m = mul <4 x i32> %bs, %as
139+
%sh = ashr <4 x i32> %m, splat (i32 15)
140+
%ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 42))
141+
%t = trunc <4 x i32> %ma to <4 x i16>
142+
ret <4 x i16> %t
143+
}
144+
145+
define <4 x i16> @unsupported_shift_value_v4i16(<4 x i16> %a, <4 x i16> %b) {
146+
; CHECK-LABEL: unsupported_shift_value_v4i16:
147+
; CHECK: // %bb.0:
148+
; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h
149+
; CHECK-NEXT: movi v1.4s, #127, msl #8
150+
; CHECK-NEXT: sshr v0.4s, v0.4s, #3
151+
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
152+
; CHECK-NEXT: xtn v0.4h, v0.4s
153+
; CHECK-NEXT: ret
154+
%as = sext <4 x i16> %a to <4 x i32>
155+
%bs = sext <4 x i16> %b to <4 x i32>
156+
%m = mul <4 x i32> %bs, %as
157+
%sh = ashr <4 x i32> %m, splat (i32 3)
158+
%ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767))
159+
%t = trunc <4 x i32> %ma to <4 x i16>
160+
ret <4 x i16> %t
161+
}
162+
163+
define <2 x i16> @extend_to_illegal_type(<2 x i16> %a, <2 x i16> %b) {
164+
; CHECK-LABEL: extend_to_illegal_type:
165+
; CHECK: // %bb.0:
166+
; CHECK-NEXT: shl v0.2s, v0.2s, #16
167+
; CHECK-NEXT: shl v1.2s, v1.2s, #16
168+
; CHECK-NEXT: sshr v0.2s, v0.2s, #16
169+
; CHECK-NEXT: sshr v1.2s, v1.2s, #16
170+
; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s
171+
; CHECK-NEXT: ret
172+
%as = sext <2 x i16> %a to <2 x i48>
173+
%bs = sext <2 x i16> %b to <2 x i48>
174+
%m = mul <2 x i48> %bs, %as
175+
%sh = ashr <2 x i48> %m, splat (i48 15)
176+
%ma = tail call <2 x i48> @llvm.smin.v4i32(<2 x i48> %sh, <2 x i48> splat (i48 32767))
177+
%t = trunc <2 x i48> %ma to <2 x i16>
178+
ret <2 x i16> %t
179+
}
180+
181+
define <2 x i11> @illegal_source(<2 x i11> %a, <2 x i11> %b) {
182+
; CHECK-LABEL: illegal_source:
183+
; CHECK: // %bb.0:
184+
; CHECK-NEXT: shl v0.2s, v0.2s, #21
185+
; CHECK-NEXT: shl v1.2s, v1.2s, #21
186+
; CHECK-NEXT: sshr v0.2s, v0.2s, #21
187+
; CHECK-NEXT: sshr v1.2s, v1.2s, #21
188+
; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
189+
; CHECK-NEXT: movi v1.2s, #127, msl #8
190+
; CHECK-NEXT: sshr v0.2s, v0.2s, #15
191+
; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
192+
; CHECK-NEXT: ret
193+
%as = sext <2 x i11> %a to <2 x i32>
194+
%bs = sext <2 x i11> %b to <2 x i32>
195+
%m = mul <2 x i32> %bs, %as
196+
%sh = ashr <2 x i32> %m, splat (i32 15)
197+
%ma = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %sh, <2 x i32> splat (i32 32767))
198+
%t = trunc <2 x i32> %ma to <2 x i11>
199+
ret <2 x i11> %t
200+
}
201+
define <1 x i16> @saturating_1xi16(<1 x i16> %a, <1 x i16> %b) {
202+
; CHECK-LABEL: saturating_1xi16:
203+
; CHECK: // %bb.0:
204+
; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
205+
; CHECK-NEXT: zip1 v1.4h, v1.4h, v0.4h
206+
; CHECK-NEXT: shl v0.2s, v0.2s, #16
207+
; CHECK-NEXT: sshr v0.2s, v0.2s, #16
208+
; CHECK-NEXT: shl v1.2s, v1.2s, #16
209+
; CHECK-NEXT: sshr v1.2s, v1.2s, #16
210+
; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
211+
; CHECK-NEXT: movi v1.2s, #127, msl #8
212+
; CHECK-NEXT: sshr v0.2s, v0.2s, #15
213+
; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
214+
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
215+
; CHECK-NEXT: ret
216+
%as = sext <1 x i16> %a to <1 x i32>
217+
%bs = sext <1 x i16> %b to <1 x i32>
218+
%m = mul <1 x i32> %bs, %as
219+
%sh = ashr <1 x i32> %m, splat (i32 15)
220+
%ma = tail call <1 x i32> @llvm.smin.v1i32(<1 x i32> %sh, <1 x i32> splat (i32 32767))
221+
%t = trunc <1 x i32> %ma to <1 x i16>
222+
ret <1 x i16> %t
223+
}

0 commit comments

Comments
 (0)