Commit 197ead7

[X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (llvm#133753)
Similar to what we already do for build_vectors during subvector extraction: when splitting concat_vectors nodes, attempt to create a pair of half-size concat_vectors nodes to see if these can fold.
1 parent e47d3a3 commit 197ead7

13 files changed (+1088, -1098 lines)
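To illustrate the transform the commit message describes, here is a minimal standalone C++ sketch of the halving. This is toy code with a hypothetical `splitConcatOps` helper operating on strings; the real implementation in the diff below works on `SDValue` operands.

```cpp
// Toy model of the new CONCAT_VECTORS path in splitVector: the operand list
// {a, b, c, d} of a wide concat is partitioned into {a, b} and {c, d}, each
// of which becomes its own half-width concat node.
#include <cassert>
#include <string>
#include <utility>
#include <vector>

using Ops = std::vector<std::string>;

static std::pair<Ops, Ops> splitConcatOps(const Ops &AllOps) {
  assert(AllOps.size() % 2 == 0 && "Can't split odd sized vector concat");
  size_t HalfOps = AllOps.size() / 2;
  Ops Lo(AllOps.begin(), AllOps.begin() + HalfOps); // -> concat_vectors(a,b)
  Ops Hi(AllOps.begin() + HalfOps, AllOps.end());   // -> concat_vectors(c,d)
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitConcatOps({"a", "b", "c", "d"});
  assert(Lo == Ops({"a", "b"}) && Hi == Ops({"c", "d"}));
}
```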

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
@@ -4326,6 +4326,18 @@ static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
          "Can't split odd sized vector");
 
+  if (Op.getOpcode() == ISD::CONCAT_VECTORS) {
+    assert((Op.getNumOperands() % 2) == 0 &&
+           "Can't split odd sized vector concat");
+    unsigned HalfOps = Op.getNumOperands() / 2;
+    EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+    SmallVector<SDValue, 2> LoOps(Op->op_begin(), Op->op_begin() + HalfOps);
+    SmallVector<SDValue, 2> HiOps(Op->op_begin() + HalfOps, Op->op_end());
+    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
+    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
+    return std::make_pair(Lo, Hi);
+  }
+
   // If this is a splat value (with no-undefs) then use the lower subvector,
   // which should be a free extraction.
   SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
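The payoff shows up in the regenerated tests below: a two-operand concat is a shape that existing combines and isel can fold directly (on AVX512 it lowers to a single vinserti128 scheduled next to its use), instead of materializing the full-width concat and re-extracting halves from it. As a hedged sketch of the caller-side pattern, with hypothetical toy helpers rather than the real SDValue-based API:

```cpp
// Toy model of how a caller consumes the (Lo, Hi) pair from splitVector:
// lower each half independently, then rejoin the results. All names here
// are illustrative stand-ins, not LLVM's actual helpers.
#include <iostream>
#include <string>
#include <utility>

using V = std::string; // stands in for an SDValue

static std::pair<V, V> splitWideConcat(const V & /*Op*/) {
  // With this patch, a CONCAT_VECTORS input yields two half-size concats
  // instead of two extract_subvector nodes off the wide concat.
  return {"concat(a,b)", "concat(c,d)"};
}

static V lowerHalf(const V &Half) {
  // Each two-operand half can fold to one subvector insert (vinserti128)
  // placed right next to the instruction that uses it.
  return "shuffle(" + Half + ")";
}

int main() {
  auto [Lo, Hi] = splitWideConcat("concat(a,b,c,d)");
  std::cout << lowerHalf(Lo) << '\n' << lowerHalf(Hi) << '\n';
}
```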

llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll

Lines changed: 20 additions & 20 deletions
@@ -2410,19 +2410,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
 ; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
 ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
 ; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2457,19 +2457,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2504,19 +2504,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
 ; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2551,19 +2551,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll

Lines changed: 30 additions & 34 deletions
@@ -552,23 +552,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-LABEL: store_i16_stride3_vf8:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem)
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10]
+; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -599,23 +597,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf8:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem)
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;

llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll

Lines changed: 4 additions & 4 deletions
@@ -513,11 +513,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
 ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
 ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -536,11 +536,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -559,11 +559,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -582,11 +582,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
