Skip to content

Commit 2907c63

Browse files
author
Rin Dobrescu
authored
Revert "[AArch64] Convert concat(uhadd(a,b), uhadd(c,d)) to uhadd(concat(a,c), concat(b,d))" (#80157)
Reverts #79464 while figuring out why the tests are failing.
1 parent 98dbc68 commit 2907c63

File tree

3 files changed

+104
-155
lines changed

3 files changed

+104
-155
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18242,21 +18242,51 @@ static SDValue performConcatVectorsCombine(SDNode *N,
1824218242
if (DCI.isBeforeLegalizeOps())
1824318243
return SDValue();
1824418244

18245-
// Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18246-
// destination size, combine into an avg of two concats of the source
18247-
// vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18248-
// concat(b, d))
18249-
if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18245+
// Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
18246+
// extracted subvectors from the same original vectors. Combine these into a
18247+
// single avg that operates on the two original vectors.
18248+
// avgceil is the target independent name for rhadd, avgfloor is a hadd.
18249+
// Example:
18250+
// (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
18251+
// extract_subvector (v16i8 OpB, <0>))),
18252+
// (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
18253+
// extract_subvector (v16i8 OpB, <8>)))))
18254+
// ->
18255+
// (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
18256+
if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
1825018257
(N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
1825118258
N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
1825218259
SDValue N00 = N0->getOperand(0);
1825318260
SDValue N01 = N0->getOperand(1);
1825418261
SDValue N10 = N1->getOperand(0);
1825518262
SDValue N11 = N1->getOperand(1);
1825618263

18257-
SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18258-
SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18259-
return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18264+
EVT N00VT = N00.getValueType();
18265+
EVT N10VT = N10.getValueType();
18266+
18267+
if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18268+
N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18269+
N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18270+
N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
18271+
SDValue N00Source = N00->getOperand(0);
18272+
SDValue N01Source = N01->getOperand(0);
18273+
SDValue N10Source = N10->getOperand(0);
18274+
SDValue N11Source = N11->getOperand(0);
18275+
18276+
if (N00Source == N10Source && N01Source == N11Source &&
18277+
N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
18278+
assert(N0.getValueType() == N1.getValueType());
18279+
18280+
uint64_t N00Index = N00.getConstantOperandVal(1);
18281+
uint64_t N01Index = N01.getConstantOperandVal(1);
18282+
uint64_t N10Index = N10.getConstantOperandVal(1);
18283+
uint64_t N11Index = N11.getConstantOperandVal(1);
18284+
18285+
if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
18286+
N10Index == N00VT.getVectorNumElements())
18287+
return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
18288+
}
18289+
}
1826018290
}
1826118291

1826218292
auto IsRSHRN = [](SDValue Shr) {

llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll

Lines changed: 66 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,75 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
33

4+
define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
5+
; CHECK-LABEL: lower_lshr:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: addv s0, v0.4s
8+
; CHECK-NEXT: addv s1, v1.4s
9+
; CHECK-NEXT: addv s4, v4.4s
10+
; CHECK-NEXT: addv s5, v5.4s
11+
; CHECK-NEXT: addv s2, v2.4s
12+
; CHECK-NEXT: addv s6, v6.4s
13+
; CHECK-NEXT: mov v0.s[1], v1.s[0]
14+
; CHECK-NEXT: addv s1, v3.4s
15+
; CHECK-NEXT: addv s3, v7.4s
16+
; CHECK-NEXT: mov v4.s[1], v5.s[0]
17+
; CHECK-NEXT: mov v0.s[2], v2.s[0]
18+
; CHECK-NEXT: mov v4.s[2], v6.s[0]
19+
; CHECK-NEXT: mov v0.s[3], v1.s[0]
20+
; CHECK-NEXT: mov v4.s[3], v3.s[0]
21+
; CHECK-NEXT: xtn v1.4h, v0.4s
22+
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
23+
; CHECK-NEXT: xtn v2.4h, v4.4s
24+
; CHECK-NEXT: shrn v3.4h, v4.4s, #16
25+
; CHECK-NEXT: uhadd v0.4h, v1.4h, v0.4h
26+
; CHECK-NEXT: uhadd v1.4h, v2.4h, v3.4h
27+
; CHECK-NEXT: mov v0.d[1], v1.d[0]
28+
; CHECK-NEXT: uaddlv s0, v0.8h
29+
; CHECK-NEXT: fmov w0, s0
30+
; CHECK-NEXT: ret
31+
%l87 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
32+
%l174 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
33+
%l257 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
34+
%l340 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
35+
%l427 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
36+
%l514 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %f)
37+
%l597 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %g)
38+
%l680 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %h)
39+
%l681 = insertelement <8 x i32> poison, i32 %l87, i32 0
40+
%l682 = insertelement <8 x i32> %l681, i32 %l174, i32 1
41+
%l683 = insertelement <8 x i32> %l682, i32 %l257, i32 2
42+
%l684 = insertelement <8 x i32> %l683, i32 %l340, i32 3
43+
%l685 = insertelement <8 x i32> %l684, i32 %l427, i32 4
44+
%l686 = insertelement <8 x i32> %l685, i32 %l514, i32 5
45+
%l687 = insertelement <8 x i32> %l686, i32 %l597, i32 6
46+
%l688 = insertelement <8 x i32> %l687, i32 %l680, i32 7
47+
%l689 = and <8 x i32> %l688, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
48+
%l690 = lshr <8 x i32> %l688, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
49+
%l691 = add nuw nsw <8 x i32> %l689, %l690
50+
%l692 = lshr <8 x i32> %l691, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
51+
%l693 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l692)
52+
ret i32 %l693
53+
}
54+
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
55+
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
56+
457
define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i16 %i, i16 %j, i16 %k, i16 %l, i16 %m, i16 %n, i16 %o, i16 %p) {
558
; CHECK-LABEL: lower_trunc_16xi8:
659
; CHECK: // %bb.0:
760
; CHECK-NEXT: fmov s0, w0
8-
; CHECK-NEXT: ldr h1, [sp]
9-
; CHECK-NEXT: add x8, sp, #8
10-
; CHECK-NEXT: ld1 { v1.h }[1], [x8]
11-
; CHECK-NEXT: add x8, sp, #16
61+
; CHECK-NEXT: add x8, sp, #56
62+
; CHECK-NEXT: ld1r { v1.8h }, [x8]
1263
; CHECK-NEXT: mov v0.h[1], w1
13-
; CHECK-NEXT: ld1 { v1.h }[2], [x8]
14-
; CHECK-NEXT: add x8, sp, #24
64+
; CHECK-NEXT: add v3.8h, v1.8h, v1.8h
1565
; CHECK-NEXT: mov v0.h[2], w2
16-
; CHECK-NEXT: ld1 { v1.h }[3], [x8]
17-
; CHECK-NEXT: add x8, sp, #32
1866
; CHECK-NEXT: mov v0.h[3], w3
19-
; CHECK-NEXT: ld1 { v1.h }[4], [x8]
20-
; CHECK-NEXT: add x8, sp, #40
21-
; CHECK-NEXT: ld1 { v1.h }[5], [x8]
22-
; CHECK-NEXT: add x8, sp, #48
2367
; CHECK-NEXT: mov v0.h[4], w4
24-
; CHECK-NEXT: ld1 { v1.h }[6], [x8]
25-
; CHECK-NEXT: add x8, sp, #56
2668
; CHECK-NEXT: mov v0.h[5], w5
27-
; CHECK-NEXT: ld1 { v1.h }[7], [x8]
2869
; CHECK-NEXT: mov v0.h[6], w6
29-
; CHECK-NEXT: add v2.8h, v1.8h, v1.8h
30-
; CHECK-NEXT: mov v0.h[7], w7
31-
; CHECK-NEXT: add v3.8h, v0.8h, v0.8h
70+
; CHECK-NEXT: add v2.8h, v0.8h, v0.8h
3271
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
33-
; CHECK-NEXT: uzp1 v1.16b, v3.16b, v2.16b
72+
; CHECK-NEXT: uzp1 v1.16b, v2.16b, v3.16b
3473
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
3574
; CHECK-NEXT: ret
3675
%a1 = insertelement <16 x i16> poison, i16 %a, i16 0
@@ -41,14 +80,14 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
4180
%f1 = insertelement <16 x i16> %e1, i16 %f, i16 5
4281
%g1 = insertelement <16 x i16> %f1, i16 %g, i16 6
4382
%h1 = insertelement <16 x i16> %g1, i16 %h, i16 7
44-
%i1 = insertelement <16 x i16> %h1, i16 %i, i16 8
45-
%j1 = insertelement <16 x i16> %i1, i16 %j, i16 9
46-
%k1 = insertelement <16 x i16> %j1, i16 %k, i16 10
47-
%l1 = insertelement <16 x i16> %k1, i16 %l, i16 11
48-
%m1 = insertelement <16 x i16> %l1, i16 %m, i16 12
49-
%n1 = insertelement <16 x i16> %m1, i16 %n, i16 13
50-
%o1 = insertelement <16 x i16> %n1, i16 %o, i16 14
51-
%p1 = insertelement <16 x i16> %o1, i16 %p, i16 15
83+
%i1 = insertelement <16 x i16> %f1, i16 %i, i16 8
84+
%j1 = insertelement <16 x i16> %g1, i16 %j, i16 9
85+
%k1 = insertelement <16 x i16> %f1, i16 %k, i16 10
86+
%l1 = insertelement <16 x i16> %g1, i16 %l, i16 11
87+
%m1 = insertelement <16 x i16> %f1, i16 %m, i16 12
88+
%n1 = insertelement <16 x i16> %g1, i16 %n, i16 13
89+
%o1 = insertelement <16 x i16> %f1, i16 %o, i16 14
90+
%p1 = insertelement <16 x i16> %g1, i16 %p, i16 15
5291
%t = trunc <16 x i16> %p1 to <16 x i8>
5392
%s = add <16 x i16> %p1, %p1
5493
%t2 = trunc <16 x i16> %s to <16 x i8>

llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll

Lines changed: 0 additions & 120 deletions
This file was deleted.

0 commit comments

Comments
 (0)