Commit 93df21f

[x86] Implement optimization and update tests
1 parent 2b2130a

2 files changed: 52 additions & 40 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 45 additions & 7 deletions
@@ -47081,7 +47081,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
-                                       const X86Subtarget &Subtarget) {
+                                       const X86Subtarget &Subtarget,
+                                       bool& TransformedBinOpReduction) {
   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
     return NewOp;
 
@@ -47169,23 +47170,33 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   // Check whether this extract is the root of a sum of absolute differences
   // pattern. This has to be done here because we really want it to happen
   // pre-legalization,
-  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) {
+    TransformedBinOpReduction = true;
     return SAD;
+  }
 
-  if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
+  if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget)) {
+    TransformedBinOpReduction = true;
     return VPDPBUSD;
+  }
 
   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
-  if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
+  if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) {
+    TransformedBinOpReduction = true;
     return Cmp;
+  }
 
   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
-  if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
+  if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget)) {
+    TransformedBinOpReduction = true;
     return MinMax;
+  }
 
   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
-  if (SDValue V = combineArithReduction(N, DAG, Subtarget))
+  if (SDValue V = combineArithReduction(N, DAG, Subtarget)) {
+    TransformedBinOpReduction = true;
     return V;
+  }
 
   if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
     return V;
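
For reference, combineBasicSADPattern matches when the extract is the root of a sum-of-absolute-differences reduction, which is why that path now sets the flag as well. A minimal IR sketch of the shape it targets (function and value names are assumed for illustration, not taken from the commit; this kind of reduction lowers to psadbw on x86):

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @sad_v16i8(<16 x i8> %a, <16 x i8> %b) {
  ; Widen, take absolute differences, then sum the lanes; the final
  ; scalar is the extract_vector_elt the combine starts from.
  %az = zext <16 x i8> %a to <16 x i32>
  %bz = zext <16 x i8> %b to <16 x i32>
  %d = sub <16 x i32> %az, %bz
  %ad = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %d, i1 true)
  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ad)
  ret i32 %r
}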
@@ -47255,6 +47266,33 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineExtractVectorEltAndOperand(SDNode* N, SelectionDAG& DAG,
+                                                 TargetLowering::DAGCombinerInfo& DCI,
+                                                 const X86Subtarget& Subtarget)
+{
+  bool TransformedBinOpReduction = false;
+  auto Op = combineExtractVectorElt(N, DAG, DCI, Subtarget, TransformedBinOpReduction);
+
+  if (TransformedBinOpReduction)
+  {
+    // In case we simplified N = extract_vector_elt(V, 0) to Op, and V
+    // resulted from a reduction, we need to replace all uses of V with
+    // scalar_to_vector(Op) to make sure that we eliminated the binop + shuffle
+    // pyramid. This is safe to do, because the elements of V are undefined
+    // except for the zeroth element.
+
+    auto OldV = N->getOperand(0);
+    auto NewV = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), OldV->getValueType(0), Op);
+
+    auto NV = DCI.CombineTo(N, Op);
+    DCI.CombineTo(OldV.getNode(), NewV);
+
+    Op = NV; // Return N so it doesn't get rechecked!
+  }
+
+  return Op;
+}
+
 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
 // This is more or less the reverse of combineBitcastvxi1.
 static SDValue combineToExtendBoolVectorInReg(
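
The comment block above is the heart of the change. A minimal IR sketch of the two-user situation it describes (names assumed, simplified from the tests below): the expanded reduction vector V feeds both the extracted scalar and, at DAG level, a splat used for a lane-wise compare. Previously only the extract was rewritten (e.g. to PHMINPOSUW), leaving the binop + shuffle pyramid alive for the second user; the wrapper now rewrites every use of V to scalar_to_vector(Op), so the pyramid becomes dead.

declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)

define <16 x i16> @umin_reduce_two_users(<16 x i16> %x) {
  ; First user: the scalar reduction result, i.e. extract_vector_elt(V, 0)
  ; of the shuffle/pminuw pyramid V that legalization builds.
  %r = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %x)
  ; Second user: a splat of that lane, which the DAG can form directly
  ; as a shuffle of V, compared lane-wise against the input.
  %ins = insertelement <16 x i16> poison, i16 %r, i64 0
  %splat = shufflevector <16 x i16> %ins, <16 x i16> poison, <16 x i32> zeroinitializer
  %eq = icmp eq <16 x i16> %splat, %x
  %mask = sext <16 x i1> %eq to <16 x i16>
  ret <16 x i16> %mask
}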
@@ -60702,7 +60740,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::EXTRACT_VECTOR_ELT:
   case X86ISD::PEXTRW:
   case X86ISD::PEXTRB:
-    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+    return combineExtractVectorEltAndOperand(N, DAG, DCI, Subtarget);
   case ISD::CONCAT_VECTORS:
     return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
   case ISD::INSERT_SUBVECTOR:

llvm/test/CodeGen/X86/optimize-reduction.ll

Lines changed: 7 additions & 33 deletions
@@ -7,16 +7,9 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    pminuw %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT:    pminuw %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
-; SSE41-NEXT:    pminuw %xmm5, %xmm6
-; SSE41-NEXT:    movdqa %xmm6, %xmm5
-; SSE41-NEXT:    psrld $16, %xmm5
-; SSE41-NEXT:    pminuw %xmm6, %xmm5
 ; SSE41-NEXT:    phminposuw %xmm4, %xmm4
 ; SSE41-NEXT:    movd %xmm4, %eax
-; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
 ; SSE41-NEXT:    pcmpeqw %xmm4, %xmm1
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
@@ -36,14 +29,8 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT:    vpminuw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpminuw %xmm3, %xmm2, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vpminuw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpsrld $16, %xmm3, %xmm4
 ; AVX2-NEXT:    vphminposuw %xmm2, %xmm2
 ; AVX2-NEXT:    vmovd %xmm2, %eax
-; AVX2-NEXT:    vpminuw %xmm4, %xmm3, %xmm2
 ; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -71,19 +58,12 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
 define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
 ; SSE41-LABEL: test_reduce_v16i16_with_add:
 ; SSE41:       # %bb.0: # %start
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
-; SSE41-NEXT:    paddw %xmm1, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT:    paddw %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
-; SSE41-NEXT:    paddw %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    phaddw %xmm0, %xmm4
+; SSE41-NEXT:    phaddw %xmm4, %xmm4
 ; SSE41-NEXT:    phaddw %xmm4, %xmm4
-; SSE41-NEXT:    movdqa %xmm1, %xmm5
-; SSE41-NEXT:    phaddw %xmm0, %xmm5
-; SSE41-NEXT:    phaddw %xmm5, %xmm5
-; SSE41-NEXT:    phaddw %xmm5, %xmm5
-; SSE41-NEXT:    phaddw %xmm5, %xmm5
-; SSE41-NEXT:    movd %xmm5, %eax
+; SSE41-NEXT:    phaddw %xmm4, %xmm4
+; SSE41-NEXT:    movd %xmm4, %eax
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
 ; SSE41-NEXT:    pcmpeqw %xmm4, %xmm1
@@ -103,18 +83,12 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
 ; AVX2-LABEL: test_reduce_v16i16_with_add:
 ; AVX2:       # %bb.0: # %start
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpaddw %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX2-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vphaddw %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vphaddw %xmm0, %xmm2, %xmm2
 ; AVX2-NEXT:    vphaddw %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vphaddw %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vphaddw %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vmovd %xmm2, %eax
-; AVX2-NEXT:    vpbroadcastw %xmm3, %ymm2
+; AVX2-NEXT:    vpbroadcastw %xmm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
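
The deltas above show the end-to-end effect: the pshufd/pminuw and pshufd/paddw pyramids that previously recomputed the reduction for the compare operand are gone, and the single phminposuw (or phaddw chain) now feeds both the extracted scalar in %eax and the broadcast used for the lane-wise compare.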
