@@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
593593; AVX512BW-LABEL: load_i16_stride5_vf4:
594594; AVX512BW: # %bb.0:
595595; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
596- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
597- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
596+ ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
597+ ; AVX512BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
598598; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
599- ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
599+ ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
600+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
600601; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
601602; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
602- ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
603+ ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
603604; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
604605; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
605- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
606+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
606607; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
607- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
608+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
608609; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
609- ; AVX512BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
610- ; AVX512BW-NEXT: vmovq %xmm2 , (%rsi)
610+ ; AVX512BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
611+ ; AVX512BW-NEXT: vmovq %xmm1 , (%rsi)
611612; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
612613; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
613614; AVX512BW-NEXT: vmovq %xmm4, (%r8)
614- ; AVX512BW-NEXT: vmovq %xmm1 , (%r9)
615+ ; AVX512BW-NEXT: vmovq %xmm2 , (%r9)
615616; AVX512BW-NEXT: vzeroupper
616617; AVX512BW-NEXT: retq
617618;
618619; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
619620; AVX512BW-FCP: # %bb.0:
620621; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
621- ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
622- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
622+ ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
623+ ; AVX512BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
623624; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
624- ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
625+ ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
626+ ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
625627; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
626628; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
627- ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
629+ ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
628630; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
629631; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
630- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
632+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
631633; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
632- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
634+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
633635; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
634- ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
635- ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
636+ ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
637+ ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
636638; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
637639; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
638640; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
639- ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%r9)
641+ ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%r9)
640642; AVX512BW-FCP-NEXT: vzeroupper
641643; AVX512BW-FCP-NEXT: retq
642644;
643645; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
644646; AVX512DQ-BW: # %bb.0:
645647; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
646- ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
647- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
648+ ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
649+ ; AVX512DQ-BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
648650; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
649- ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
651+ ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
652+ ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
650653; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
651654; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
652- ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
655+ ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
653656; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
654657; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
655- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
658+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
656659; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
657- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
660+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
658661; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
659- ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
660- ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%rsi)
662+ ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
663+ ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%rsi)
661664; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
662665; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
663666; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
664- ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%r9)
667+ ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%r9)
665668; AVX512DQ-BW-NEXT: vzeroupper
666669; AVX512DQ-BW-NEXT: retq
667670;
668671; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
669672; AVX512DQ-BW-FCP: # %bb.0:
670673; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
671- ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
672- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
674+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
675+ ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
673676; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
674- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
677+ ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
678+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
675679; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
676680; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
677- ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
681+ ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
678682; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
679683; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
680- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
684+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
681685; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
682- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
686+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
683687; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
684- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
685- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
688+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
689+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
686690; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
687691; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
688692; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
689- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%r9)
693+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%r9)
690694; AVX512DQ-BW-FCP-NEXT: vzeroupper
691695; AVX512DQ-BW-FCP-NEXT: retq
692696 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
0 commit comments