@@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
593
593
; AVX512BW-LABEL: load_i16_stride5_vf4:
594
594
; AVX512BW: # %bb.0:
595
595
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
596
- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
597
- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
596
+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
597
+ ; AVX512BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
598
598
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
599
- ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
599
+ ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
600
+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
600
601
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
601
602
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
602
- ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
603
+ ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
603
604
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
604
605
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
605
- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
606
+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
606
607
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
607
- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
608
+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
608
609
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
609
- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
610
- ; AVX512BW-NEXT: vmovq %xmm2 , (%rsi)
610
+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
611
+ ; AVX512BW-NEXT: vmovq %xmm1 , (%rsi)
611
612
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
612
613
; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
613
614
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
614
- ; AVX512BW-NEXT: vmovq %xmm1 , (%r9)
615
+ ; AVX512BW-NEXT: vmovq %xmm2 , (%r9)
615
616
; AVX512BW-NEXT: vzeroupper
616
617
; AVX512BW-NEXT: retq
617
618
;
618
619
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
619
620
; AVX512BW-FCP: # %bb.0:
620
621
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
621
- ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
622
- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
622
+ ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
623
+ ; AVX512BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
623
624
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
624
- ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
625
+ ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
626
+ ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
625
627
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
626
628
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
627
- ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
629
+ ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
628
630
; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
629
631
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
630
- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
632
+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
631
633
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
632
- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
634
+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
633
635
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
634
- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
635
- ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
636
+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
637
+ ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
636
638
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
637
639
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
638
640
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
639
- ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%r9)
641
+ ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%r9)
640
642
; AVX512BW-FCP-NEXT: vzeroupper
641
643
; AVX512BW-FCP-NEXT: retq
642
644
;
643
645
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
644
646
; AVX512DQ-BW: # %bb.0:
645
647
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
646
- ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
647
- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
648
+ ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
649
+ ; AVX512DQ-BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
648
650
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
649
- ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
651
+ ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
652
+ ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
650
653
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
651
654
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
652
- ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
655
+ ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
653
656
; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
654
657
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
655
- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
658
+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
656
659
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
657
- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
660
+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
658
661
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
659
- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
660
- ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%rsi)
662
+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
663
+ ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%rsi)
661
664
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
662
665
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
663
666
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
664
- ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%r9)
667
+ ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%r9)
665
668
; AVX512DQ-BW-NEXT: vzeroupper
666
669
; AVX512DQ-BW-NEXT: retq
667
670
;
668
671
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
669
672
; AVX512DQ-BW-FCP: # %bb.0:
670
673
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
671
- ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
672
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
674
+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
675
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
673
676
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
674
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
677
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
678
+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
675
679
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
676
680
; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
677
- ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
681
+ ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
678
682
; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
679
683
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
680
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
684
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
681
685
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
682
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
686
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
683
687
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
684
- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
685
- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
688
+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
689
+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
686
690
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
687
691
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
688
692
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
689
- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%r9)
693
+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%r9)
690
694
; AVX512DQ-BW-FCP-NEXT: vzeroupper
691
695
; AVX512DQ-BW-FCP-NEXT: retq
692
696
%wide.vec = load <20 x i16>, ptr %in.vec, align 64
0 commit comments