@@ -528,3 +528,70 @@ define i32 @main() nounwind {
   %r = add i32 %e1, %e2
   ret i32 %r
 }
+
+; A test for an incorrect combine of a single-value extraction from a VBROADCAST_LOAD.
+; The wrong combine makes the second call (%t8) use the result stored by the
+; preceding instructions instead of %t4.
+declare <2 x float> @ccosf(<2 x float>)
+define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr %p, ptr %arr) nounwind {
+; X86-SSE2-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT:    movups 24(%esi), %xmm0
+; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
+; X86-SSE2-NEXT:    movaps 32(%esi), %xmm0
+; X86-SSE2-NEXT:    calll ccosf@PLT
+; X86-SSE2-NEXT:    movlps %xmm0, 32(%esi)
+; X86-SSE2-NEXT:    movups (%esp), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X86-SSE2-NEXT:    calll ccosf@PLT
+; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSSE3-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    pushq %rbx
+; X64-SSSE3-NEXT:    subq $16, %rsp
+; X64-SSSE3-NEXT:    movq %rsi, %rbx
+; X64-SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
+; X64-SSSE3-NEXT:    movlpd %xmm0, (%rdi)
+; X64-SSSE3-NEXT:    movaps 32(%rsi), %xmm0
+; X64-SSSE3-NEXT:    callq ccosf@PLT
+; X64-SSSE3-NEXT:    movlps %xmm0, 32(%rbx)
+; X64-SSSE3-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; X64-SSSE3-NEXT:    callq ccosf@PLT
+; X64-SSSE3-NEXT:    addq $16, %rsp
+; X64-SSSE3-NEXT:    popq %rbx
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rbx
+; X64-AVX-NEXT:    movq %rsi, %rbx
+; X64-AVX-NEXT:    vmovsd 32(%rsi), %xmm0 # xmm0 = mem[0],zero
+; X64-AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT:    vmovaps 32(%rsi), %xmm0
+; X64-AVX-NEXT:    callq ccosf@PLT
+; X64-AVX-NEXT:    vmovlps %xmm0, 32(%rbx)
+; X64-AVX-NEXT:    vmovddup 32(%rbx), %xmm0 # xmm0 = mem[0,0]
+; X64-AVX-NEXT:    callq ccosf@PLT
+; X64-AVX-NEXT:    popq %rbx
+; X64-AVX-NEXT:    retq
+  %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3
+  %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0
+  %t3 = load <4 x float>, ptr %p1, align 8
+  %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x float> %t4, ptr %p, align 16
+  %t5 = load <4 x float>, ptr %p2, align 32
+  %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  %t7 = call <2 x float> @ccosf(<2 x float> %t6)
+  store <2 x float> %t7, ptr %p2, align 32
+  %t8 = call <2 x float> @ccosf(<2 x float> %t4)
+  ret <2 x float> %t8
+}
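
For context (not part of the diff above): the comment's scenario can be pictured at the IR level as %t4 being rematerialized as a fresh load from %arr+32, the same bytes %p2 points at and that the store of %t7 has already overwritten. The following is a minimal sketch under that assumption; the function name and %t4.reload are hypothetical, and the actual combine happens on the SelectionDAG rather than on IR.

declare <2 x float> @ccosf(<2 x float>)

; Hypothetical IR-level equivalent of the bad combine: %t4 is reloaded from %p2
; after %t7 has been stored there, so the second call sees %t7 instead of %t4.
define <2 x float> @wrongly_combined_sketch(ptr %p, ptr %arr) nounwind {
  %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3
  %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0
  %t3 = load <4 x float>, ptr %p1, align 8       ; reads %arr+24..39
  %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> <i32 2, i32 3>
  store <2 x float> %t4, ptr %p, align 16
  %t5 = load <4 x float>, ptr %p2, align 32
  %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
  %t7 = call <2 x float> @ccosf(<2 x float> %t6)
  store <2 x float> %t7, ptr %p2, align 32        ; clobbers %arr+32..39
  %t4.reload = load <2 x float>, ptr %p2, align 8 ; reads %t7, not the original %t4
  %t8 = call <2 x float> @ccosf(<2 x float> %t4.reload)
  ret <2 x float> %t8
}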