@@ -857,6 +857,67 @@ define <4 x double> @shuffle_v4f64_2345_0567_select(<4 x double> %vec1, <4 x dou
857857 ret <4 x double > %res
858858}
859859
860+ ; PR140234
; NOTE(review): regression test for assembling a <4 x double> shuffle from four
; split 128-bit loads.  With v = concat(x, y) the final value is v[1,4,3,6],
; i.e. <x[1], y[0], x[3], y[2]>.  The per-target CHECK blocks below are
; autogenerated (update_llc_test_checks.py) and must match llc output exactly
; -- do not hand-edit them; regenerate instead.
861+ define <4 x double > @shuffle_v4f64_1436_split_load (ptr %px , ptr %py ) {
862+ ; AVX1-LABEL: shuffle_v4f64_1436_split_load:
863+ ; AVX1: # %bb.0:
864+ ; AVX1-NEXT: vmovapd (%rsi), %xmm0
865+ ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
866+ ; AVX1-NEXT: vmovupd (%rdi), %ymm1
867+ ; AVX1-NEXT: vinsertf128 $1, 16(%rsi), %ymm0, %ymm0
868+ ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[2]
869+ ; AVX1-NEXT: retq
870+ ;
871+ ; AVX2-LABEL: shuffle_v4f64_1436_split_load:
872+ ; AVX2: # %bb.0:
873+ ; AVX2-NEXT: vmovapd (%rsi), %xmm0
874+ ; AVX2-NEXT: vmovupd (%rdi), %ymm1
875+ ; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
876+ ; AVX2-NEXT: vbroadcastsd 16(%rsi), %ymm1
877+ ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
878+ ; AVX2-NEXT: retq
879+ ;
880+ ; AVX512VL-SLOW-LABEL: shuffle_v4f64_1436_split_load:
881+ ; AVX512VL-SLOW: # %bb.0:
882+ ; AVX512VL-SLOW-NEXT: vmovapd (%rsi), %xmm0
883+ ; AVX512VL-SLOW-NEXT: vmovupd (%rdi), %ymm1
884+ ; AVX512VL-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
885+ ; AVX512VL-SLOW-NEXT: vbroadcastsd 16(%rsi), %ymm1
886+ ; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
887+ ; AVX512VL-SLOW-NEXT: retq
888+ ;
889+ ; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_1436_split_load:
890+ ; AVX512VL-FAST-ALL: # %bb.0:
891+ ; AVX512VL-FAST-ALL-NEXT: vmovapd (%rsi), %xmm0
892+ ; AVX512VL-FAST-ALL-NEXT: vmovapd 16(%rsi), %xmm1
893+ ; AVX512VL-FAST-ALL-NEXT: vmovupd (%rdi), %ymm2
894+ ; AVX512VL-FAST-ALL-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[3]
895+ ; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,1,2,4]
896+ ; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
897+ ; AVX512VL-FAST-ALL-NEXT: retq
898+ ;
899+ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_1436_split_load:
900+ ; AVX512VL-FAST-PERLANE: # %bb.0:
901+ ; AVX512VL-FAST-PERLANE-NEXT: vmovapd (%rsi), %xmm0
902+ ; AVX512VL-FAST-PERLANE-NEXT: vmovupd (%rdi), %ymm1
903+ ; AVX512VL-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[3]
904+ ; AVX512VL-FAST-PERLANE-NEXT: vbroadcastsd 16(%rsi), %ymm1
905+ ; AVX512VL-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
906+ ; AVX512VL-FAST-PERLANE-NEXT: retq
; Load x = <4 x double> at %px and y = <4 x double> at %py, each as two
; aligned 128-bit halves (x0/x1 and y0/y1).
907+ %pxhi = getelementptr inbounds nuw i8 , ptr %px , i64 16
908+ %pyhi = getelementptr inbounds nuw i8 , ptr %py , i64 16
909+ %x0 = load <2 x double >, ptr %px , align 16
910+ %y0 = load <2 x double >, ptr %py , align 16
911+ %x1 = load <2 x double >, ptr %pxhi , align 16
912+ %y1 = load <2 x double >, ptr %pyhi , align 16
; shuf0 = <x[1], y[0], undef, undef>; shuf2 inserts x[3] into lane 2.
913+ %shuf0 = shufflevector <2 x double > %x0 , <2 x double > %y0 , <4 x i32 > <i32 1 , i32 2 , i32 poison, i32 poison>
914+ %shuf1 = shufflevector <2 x double > %x1 , <2 x double > poison, <4 x i32 > <i32 poison, i32 1 , i32 poison, i32 poison>
915+ %shuf2 = shufflevector <4 x double > %shuf0 , <4 x double > %shuf1 , <4 x i32 > <i32 0 , i32 1 , i32 5 , i32 poison>
; shuf4 fills lane 3 with y[2], giving <x[1], y[0], x[3], y[2]>.
916+ %shuf3 = shufflevector <2 x double > %y1 , <2 x double > poison, <4 x i32 > <i32 0 , i32 poison, i32 poison, i32 poison>
917+ %shuf4 = shufflevector <4 x double > %shuf2 , <4 x double > %shuf3 , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 4 >
918+ ret <4 x double > %shuf4
919+ }
920+
860921define <4 x i64 > @shuffle_v4i64_0000 (<4 x i64 > %a , <4 x i64 > %b ) {
861922; AVX1-LABEL: shuffle_v4i64_0000:
862923; AVX1: # %bb.0:
0 commit comments