@@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
593593; AVX512BW-LABEL: load_i16_stride5_vf4:
594594; AVX512BW: # %bb.0:
595595; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
596- ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
597- ; AVX512BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
596+ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
597+ ; AVX512BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
598598; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
599- ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
600- ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
599+ ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
601600; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
602601; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
603- ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
602+ ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
604603; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
605604; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
606- ; AVX512BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
605+ ; AVX512BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
607606; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
608- ; AVX512BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
607+ ; AVX512BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
609608; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
610- ; AVX512BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
611- ; AVX512BW-NEXT: vmovq %xmm1 , (%rsi)
609+ ; AVX512BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
610+ ; AVX512BW-NEXT: vmovq %xmm2 , (%rsi)
612611; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
613612; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
614613; AVX512BW-NEXT: vmovq %xmm4, (%r8)
615- ; AVX512BW-NEXT: vmovq %xmm2 , (%r9)
614+ ; AVX512BW-NEXT: vmovq %xmm1 , (%r9)
616615; AVX512BW-NEXT: vzeroupper
617616; AVX512BW-NEXT: retq
618617;
619618; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
620619; AVX512BW-FCP: # %bb.0:
621620; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
622- ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
623- ; AVX512BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
621+ ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
622+ ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
624623; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
625- ; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
626- ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
624+ ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
627625; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
628626; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
629- ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
627+ ; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
630628; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
631629; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
632- ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
630+ ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
633631; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
634- ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
632+ ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
635633; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
636- ; AVX512BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
637- ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
634+ ; AVX512BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
635+ ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
638636; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
639637; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
640638; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
641- ; AVX512BW-FCP-NEXT: vmovq %xmm2 , (%r9)
639+ ; AVX512BW-FCP-NEXT: vmovq %xmm1 , (%r9)
642640; AVX512BW-FCP-NEXT: vzeroupper
643641; AVX512BW-FCP-NEXT: retq
644642;
645643; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
646644; AVX512DQ-BW: # %bb.0:
647645; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
648- ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
649- ; AVX512DQ-BW-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
646+ ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
647+ ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
650648; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
651- ; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
652- ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
649+ ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
653650; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
654651; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
655- ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
652+ ; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
656653; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
657654; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
658- ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm3, %zmm3
655+ ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm3, %zmm3
659656; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
660- ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm4, %zmm4
657+ ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm4, %zmm4
661658; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
662- ; AVX512DQ-BW-NEXT: vpermw %zmm2 , %zmm5, %zmm2
663- ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%rsi)
659+ ; AVX512DQ-BW-NEXT: vpermw %zmm1 , %zmm5, %zmm1
660+ ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%rsi)
664661; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
665662; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
666663; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
667- ; AVX512DQ-BW-NEXT: vmovq %xmm2 , (%r9)
664+ ; AVX512DQ-BW-NEXT: vmovq %xmm1 , (%r9)
668665; AVX512DQ-BW-NEXT: vzeroupper
669666; AVX512DQ-BW-NEXT: retq
670667;
671668; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
672669; AVX512DQ-BW-FCP: # %bb.0:
673670; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
674- ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
675- ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1 , %ymm0 , %ymm0
671+ ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
672+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm0 , %zmm0
676673; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
677- ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
678- ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
674+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
679675; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
680676; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
681- ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1 , %xmm1
677+ ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2 , %xmm2
682678; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
683679; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
684- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm3, %zmm3
680+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm3, %zmm3
685681; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
686- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm4, %zmm4
682+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm4, %zmm4
687683; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
688- ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2 , %zmm5, %zmm2
689- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%rsi)
684+ ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1 , %zmm5, %zmm1
685+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%rsi)
690686; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
691687; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
692688; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
693- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2 , (%r9)
689+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1 , (%r9)
694690; AVX512DQ-BW-FCP-NEXT: vzeroupper
695691; AVX512DQ-BW-FCP-NEXT: retq
696692 %wide.vec = load <20 x i16>, ptr %in.vec, align 64
0 commit comments