@@ -2410,19 +2410,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
- ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
- ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+ ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+ ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+ ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+ ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+ ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2457,19 +2457,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
- ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
- ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+ ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+ ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+ ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+ ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+ ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2504,19 +2504,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
- ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
- ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+ ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+ ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+ ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+ ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+ ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2551,19 +2551,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
- ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+ ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]