Skip to content

Commit 3458e8b

Browse files
RKSimon authored and memfrob committed
[X86][SSE] Fix typo + infinite-loop in HOP(HOP'(X,X),HOP'(Y,Y)) fold (PR52040)
PR52040 identified several issues with the HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))) slow-HOP fold. Not only was there a copy+paste typo when accessing the inner HOP operands, but the (unnecessary) ReplaceAllUsesOfValueWith call was also missing one-use checks. Now that we have better shuffle combines of HOPs, we can just return a new HOP() sequence and not use ReplaceAllUsesOfValueWith at all - this actually improved pair_sum_v8i32_v4i32 codegen, as it kicks off further shuffle combines.
1 parent 686cf22 commit 3458e8b

File tree

3 files changed

+34
-12
lines changed

3 files changed

+34
-12
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45118,17 +45118,18 @@ static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
4511845118
// HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
4511945119
if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
4512045120
LHS.getOpcode() == RHS.getOpcode() &&
45121-
LHS.getValueType() == RHS.getValueType()) {
45121+
LHS.getValueType() == RHS.getValueType() &&
45122+
N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
4512245123
SDValue LHS0 = LHS.getOperand(0);
45123-
SDValue RHS0 = LHS.getOperand(1);
45124-
SDValue LHS1 = RHS.getOperand(0);
45124+
SDValue LHS1 = LHS.getOperand(1);
45125+
SDValue RHS0 = RHS.getOperand(0);
4512545126
SDValue RHS1 = RHS.getOperand(1);
45126-
if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
45127-
(LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
45127+
if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
45128+
(RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
4512845129
SDLoc DL(N);
4512945130
SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
45130-
LHS0.isUndef() ? RHS0 : LHS0,
45131-
LHS1.isUndef() ? RHS1 : LHS1);
45131+
LHS0.isUndef() ? LHS1 : LHS0,
45132+
RHS0.isUndef() ? RHS1 : RHS0);
4513245133
MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
4513345134
Res = DAG.getBitcast(ShufVT, Res);
4513445135
SDValue NewLHS =
@@ -45137,9 +45138,8 @@ static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
4513745138
SDValue NewRHS =
4513845139
DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
4513945140
getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
45140-
DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
45141-
DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
45142-
return SDValue(N, 0);
45141+
return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
45142+
DAG.getBitcast(VT, NewRHS));
4514345143
}
4514445144
}
4514545145
}

llvm/test/CodeGen/X86/horizontal-shuffle-2.ll

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,28 @@ define <8 x i16> @PR51974(<8 x i16> %a0) {
190190
ret <8 x i16> %r1
191191
}
192192

193+
define <8 x i16> @PR52040(<8 x i16> %a0) {
194+
; SSE-LABEL: PR52040:
195+
; SSE: ## %bb.0:
196+
; SSE-NEXT: phaddw %xmm0, %xmm0
197+
; SSE-NEXT: movdqa %xmm0, %xmm1
198+
; SSE-NEXT: phaddw %xmm0, %xmm1
199+
; SSE-NEXT: phaddw %xmm0, %xmm1
200+
; SSE-NEXT: movdqa %xmm1, %xmm0
201+
; SSE-NEXT: ret{{[l|q]}}
202+
;
203+
; AVX-LABEL: PR52040:
204+
; AVX: ## %bb.0:
205+
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
206+
; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm1
207+
; AVX-NEXT: vphaddw %xmm0, %xmm1, %xmm0
208+
; AVX-NEXT: ret{{[l|q]}}
209+
%r1 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a0)
210+
%r2 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r1, <8 x i16> %r1)
211+
%r3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r2, <8 x i16> %r1)
212+
ret <8 x i16> %r3
213+
}
214+
193215
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
194216
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
195217
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -364,8 +364,8 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
364364
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
365365
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
366366
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
367-
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,1,1]
368-
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2]
367+
; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
368+
; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
369369
; SSSE3-SLOW-NEXT: retq
370370
;
371371
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:

0 commit comments

Comments
 (0)