@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
25082508 store <64 x i32 > %i7 , ptr %output , align 16
25092509 ret void
25102510}
2511+
; Regression test: ensure a concatenation of repeated subvector loads (lowered
; as a subvector broadcast) can be split apart again before the wide shuffle,
; instead of asserting/miscompiling. The loads at +8 and +40 overlap-free cover
; the <6 x i64> source; the shuffle interleaves lanes 0/4 and 1/5.
define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
; SSE-LABEL: split_v2i64_subvector_broadcast:
; SSE:       # %bb.0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    movups 40(%rdi), %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movups %xmm0, (%rax)
; SSE-NEXT:    movups %xmm2, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: split_v2i64_subvector_broadcast:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; AVX1-NEXT:    vmovupd %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: split_v2i64_subvector_broadcast:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovups 40(%rdi), %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vmovups %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: split_v2i64_subvector_broadcast:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; XOP-NEXT:    vmovupd %ymm0, (%rax)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
  %load = load <6 x i64>, ptr %gep, align 8
  %shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; Store destination is poison: only the shuffle lowering matters here.
  store <4 x i64> %shuffle, ptr poison, align 8
  ret void
}