
Commit 13b56cc

RKSimon authored and memfrob committed
[X86][SSE] combineX86ShufflesRecursively - bail if constant folding fails due to oneuse limits.
Fixes the issue reported on D105827 where a single shuffle of a constant (with multiple uses) was caught in an infinite loop: one shuffle (UNPCKL) used an undef arg, but that was then recombined to SHUFPS because the constant value had its own undef that confused matching.
1 parent 88cc292

2 files changed: 140 additions & 0 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
@@ -37085,6 +37085,18 @@ static SDValue combineX86ShufflesRecursively(
           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
     return Cst;
 
+  // If constant fold failed and we only have constants - then we have
+  // multiple uses by a single non-variable shuffle - just bail.
+  if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
+        APInt UndefElts;
+        SmallVector<APInt> RawBits;
+        unsigned EltSizeInBits = RootSizeInBits / Mask.size();
+        return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
+                                             RawBits);
+      })) {
+    return SDValue();
+  }
+
   // Canonicalize the combined shuffle mask chain with horizontal ops.
   // NOTE: This will update the Ops and Mask.
   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
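
To make the intent of the new guard concrete, below is a minimal, self-contained sketch of the same bail-out pattern. The names (Operand, tryConstantFold, combineRecursively) are illustrative stand-ins, not LLVM APIs: if the one-use-limited constant folder refuses to fold and every operand is already a constant, recursing further cannot expose anything new to fold, so the combine gives up instead of re-deriving the same shuffles.

// Sketch only (assumed/illustrative names) - this is NOT the LLVM code above.
// tryConstantFold() stands in for the one-use-limited constant folder that
// refused to fold in the reported case.
#include <algorithm>
#include <cstdio>
#include <optional>
#include <vector>

struct Operand {
  bool IsConstant; // operand is a constant (e.g. a constant-pool vector)
  int NumUses;     // how many other nodes consume it
};

// The folder only succeeds when every constant operand has a single use,
// mirroring the oneuse limit described in the commit message.
static std::optional<int> tryConstantFold(const std::vector<Operand> &Ops) {
  for (const Operand &Op : Ops)
    if (!Op.IsConstant || Op.NumUses > 1)
      return std::nullopt; // fold refused
  return 42;               // placeholder for the folded constant
}

static std::optional<int> combineRecursively(const std::vector<Operand> &Ops,
                                             unsigned Depth) {
  if (std::optional<int> Folded = tryConstantFold(Ops))
    return Folded;

  // Key guard: folding failed, yet every operand is already a constant, so
  // further recursion can only keep rewriting the same constant shuffle and
  // never make progress - bail out instead of looping.
  if (Depth == 0 &&
      std::all_of(Ops.begin(), Ops.end(),
                  [](const Operand &Op) { return Op.IsConstant; }))
    return std::nullopt;

  // ... the real combine would keep merging shuffle masks and recursing here.
  return std::nullopt;
}

int main() {
  // A single constant operand with two uses: the fold is refused and the
  // Depth == 0 guard bails out cleanly.
  std::vector<Operand> Ops = {{/*IsConstant=*/true, /*NumUses=*/2}};
  bool Bailed = !combineRecursively(Ops, /*Depth=*/0).has_value();
  std::printf("bailed out: %s\n", Bailed ? "yes" : "no");
  return 0;
}

The Depth == 0 restriction appears to limit the bail-out to the root of the recursion, where the root shuffle itself is the single non-variable user mentioned in the new comment; nodes deeper in the chain are still allowed to combine.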

llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Lines changed: 128 additions & 0 deletions
@@ -3287,3 +3287,131 @@ define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
   store <32 x i16> %v3, <32 x i16>* %dst, align 16
   ret void
 }
+
+; Test case reported on D105827
+define void @SpinningCube() {
+; SSE2-LABEL: SpinningCube:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
+; SSE2-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSE2-NEXT: addps %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm2, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; SSE2-NEXT: mulps %xmm2, %xmm1
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: SpinningCube:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
+; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movaps %xmm2, %xmm3
+; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
+; SSSE3-NEXT: xorps %xmm4, %xmm4
+; SSSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
+; SSSE3-NEXT: addps %xmm4, %xmm2
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
+; SSSE3-NEXT: mulps %xmm1, %xmm2
+; SSSE3-NEXT: addps %xmm0, %xmm2
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: SpinningCube:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
+; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
+; SSE41-NEXT: addps %xmm3, %xmm4
+; SSE41-NEXT: movaps %xmm4, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
+; SSE41-NEXT: mulps %xmm1, %xmm2
+; SSE41-NEXT: addps %xmm0, %xmm2
+; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: SpinningCube:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
+; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
+; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX1-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovaps %xmm2, (%rax)
+; AVX1-NEXT: vbroadcastss (%rax), %xmm2
+; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rax)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: SpinningCube:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
+; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
+; AVX2-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, (%rax)
+; AVX2-NEXT: vbroadcastss (%rax), %xmm2
+; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovaps %xmm0, (%rax)
+; AVX2-NEXT: retq
+entry:
+  store float 1.000000e+00, float* undef, align 4
+  %0 = load float, float* undef, align 4
+  %1 = fmul float undef, 0.000000e+00
+  %2 = insertelement <4 x float> poison, float %0, i32 3
+  %3 = load float, float* undef, align 4
+  %4 = insertelement <2 x float> poison, float %3, i32 0
+  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
+  %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
+  %7 = fadd float %1, undef
+  %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+  %10 = insertelement <4 x float> %9, float %7, i32 3
+  %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
+  %12 = insertelement <4 x float> %11, float undef, i32 0
+  %13 = insertelement <4 x float> %12, float undef, i32 2
+  %14 = fadd <4 x float> %10, %13
+  store <4 x float> %14, <4 x float>* undef, align 16
+  %15 = load float, float* undef, align 4
+  %16 = insertelement <2 x float> poison, float %15, i32 0
+  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
+  %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
+  %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
+  %21 = fadd <4 x float> %20, %2
+  store <4 x float> %21, <4 x float>* undef, align 16
+  ret void
+}
