@@ -639,7 +639,23 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
639639; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
640640; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
641641; SKX_32-NEXT: retl
642-
642+ ; X64-LABEL: test14:
643+ ; X64: # %bb.0:
644+ ; X64-NEXT: vmovq %xmm0, %rax
645+ ; X64-NEXT: vpbroadcastd %esi, %zmm1
646+ ; X64-NEXT: kxnorw %k0, %k0, %k1
647+ ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
648+ ; X64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
649+ ; X64-NEXT: retq
650+ ;
651+ ; X86-LABEL: test14:
652+ ; X86: # %bb.0:
653+ ; X86-NEXT: vmovd %xmm0, %eax
654+ ; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
655+ ; X86-NEXT: kxnorw %k0, %k0, %k1
656+ ; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0
657+ ; X86-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
658+ ; X86-NEXT: retl
643659 %broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
644660 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
645661
@@ -4826,16 +4842,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48264842; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
48274843; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
48284844; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4829- ; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4830- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4831- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4832- ; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
4833- ; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
4834- ; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4835- ; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
4836- ; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
4837- ; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
4838- ; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4845+ ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4846+ ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
4847+ ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
48394848; X64-KNL-NEXT: retq
48404849;
48414850; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4845,8 +4854,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48454854; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
48464855; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
48474856; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %ecx
4848- ; X86-KNL-NEXT: vpslld $4, (%ecx), %zmm0
4849- ; X86-KNL-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
4857+ ; X86-KNL-NEXT: vmovdqu64 (%ecx), %zmm0
4858+ ; X86-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
4859+ ; X86-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4860+ ; X86-KNL-NEXT: vgatherdps (%eax,%zmm0,8), %zmm1 {%k1}
48504861; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
48514862; X86-KNL-NEXT: retl
48524863;
@@ -4857,16 +4868,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48574868; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
48584869; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
48594870; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4860- ; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4861- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4862- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4863- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
4864- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
4865- ; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4866- ; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
4867- ; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
4868- ; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
4869- ; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4871+ ; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4872+ ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
4873+ ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
48704874; X64-SKX-SMALL-NEXT: retq
48714875;
48724876; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4877,16 +4881,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48774881; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
48784882; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
48794883; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
4880- ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4881- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4882- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4883- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
4884- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
4885- ; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4886- ; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
4887- ; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
4888- ; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
4889- ; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4884+ ; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4885+ ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
4886+ ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
48904887; X64-SKX-LARGE-NEXT: retq
48914888;
48924889; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4896,8 +4893,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
48964893; X86-SKX-NEXT: vpmovd2m %zmm0, %k1
48974894; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
48984895; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx
4899- ; X86-SKX-NEXT: vpslld $4, (%ecx), %zmm0
4900- ; X86-SKX-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
4896+ ; X86-SKX-NEXT: vmovdqu64 (%ecx), %zmm0
4897+ ; X86-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
4898+ ; X86-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4899+ ; X86-SKX-NEXT: vgatherdps (%eax,%zmm0,8), %zmm1 {%k1}
49014900; X86-SKX-NEXT: vmovaps %zmm1, %zmm0
49024901; X86-SKX-NEXT: retl
49034902 %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -4916,16 +4915,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49164915; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
49174916; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
49184917; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4919- ; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4920- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4921- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4922- ; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
4923- ; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
4924- ; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4925- ; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
4926- ; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
4927- ; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
4928- ; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4918+ ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4919+ ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
4920+ ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
49294921; X64-KNL-NEXT: retq
49304922;
49314923; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4935,8 +4927,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49354927; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
49364928; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
49374929; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %ecx
4938- ; X86-KNL-NEXT: vpslld $4, (%ecx), %zmm0
4939- ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
4930+ ; X86-KNL-NEXT: vmovdqu64 (%ecx), %zmm0
4931+ ; X86-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
4932+ ; X86-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4933+ ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm0,8), %zmm1 {%k1}
49404934; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
49414935; X86-KNL-NEXT: retl
49424936;
@@ -4947,16 +4941,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49474941; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
49484942; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
49494943; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4950- ; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4951- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4952- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4953- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
4954- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
4955- ; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4956- ; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
4957- ; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
4958- ; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
4959- ; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4944+ ; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4945+ ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
4946+ ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
49604947; X64-SKX-SMALL-NEXT: retq
49614948;
49624949; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4967,16 +4954,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49674954; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
49684955; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
49694956; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
4970- ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
4971- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
4972- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
4973- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
4974- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
4975- ; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
4976- ; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
4977- ; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
4978- ; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
4979- ; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
4957+ ; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4958+ ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
4959+ ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
49804960; X64-SKX-LARGE-NEXT: retq
49814961;
49824962; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4986,8 +4966,10 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
49864966; X86-SKX-NEXT: vpmovd2m %zmm0, %k1
49874967; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
49884968; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx
4989- ; X86-SKX-NEXT: vpslld $4, (%ecx), %zmm0
4990- ; X86-SKX-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
4969+ ; X86-SKX-NEXT: vmovdqu64 (%ecx), %zmm0
4970+ ; X86-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
4971+ ; X86-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0
4972+ ; X86-SKX-NEXT: vgatherdps 4(%eax,%zmm0,8), %zmm1 {%k1}
49914973; X86-SKX-NEXT: vmovaps %zmm1, %zmm0
49924974; X86-SKX-NEXT: retl
49934975 %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -5006,23 +4988,11 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
50064988; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
50074989; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
50084990; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
5009- ; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
5010- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
5011- ; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
5012- ; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm3
5013- ; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
5014- ; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
5015- ; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
5016- ; X64-KNL-NEXT: kmovw %k2, %k3
5017- ; X64-KNL-NEXT: vmovaps %ymm4, %ymm0
5018- ; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
5019- ; X64-KNL-NEXT: vmovaps %ymm1, %ymm5
5020- ; X64-KNL-NEXT: kmovw %k1, %k3
5021- ; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
5022- ; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
5023- ; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
5024- ; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
5025- ; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
4991+ ; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
4992+ ; X64-KNL-NEXT: kmovw %k1, %k2
4993+ ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
4994+ ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
4995+ ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
50264996; X64-KNL-NEXT: retq
50274997;
50284998; X86-KNL-LABEL: test_gather_16f32_mask_index_pair:
@@ -5032,11 +5002,13 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
50325002; X86-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
50335003; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
50345004; X86-KNL-NEXT: movl {{[0-9]+}}(%esp), %ecx
5035- ; X86-KNL-NEXT: vpslld $4, (%ecx), %zmm2
5005+ ; X86-KNL-NEXT: vmovdqu64 (%ecx), %zmm0
5006+ ; X86-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
5007+ ; X86-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
50365008; X86-KNL-NEXT: kmovw %k1, %k2
50375009; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
5038- ; X86-KNL-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
5039- ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
5010+ ; X86-KNL-NEXT: vgatherdps (%eax,%zmm2,8), %zmm0 {%k2}
5011+ ; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
50405012; X86-KNL-NEXT: retl
50415013;
50425014; X64-SKX-SMALL-LABEL: test_gather_16f32_mask_index_pair:
@@ -5046,23 +5018,11 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
50465018; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
50475019; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
50485020; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
5049- ; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
5050- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
5051- ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
5052- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
5053- ; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
5054- ; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
5055- ; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
5056- ; X64-SKX-SMALL-NEXT: kmovw %k2, %k3
5057- ; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0
5058- ; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
5059- ; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5
5060- ; X64-SKX-SMALL-NEXT: kmovw %k1, %k3
5061- ; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
5062- ; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
5063- ; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
5064- ; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
5065- ; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
5021+ ; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
5022+ ; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
5023+ ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
5024+ ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
5025+ ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
50665026; X64-SKX-SMALL-NEXT: retq
50675027;
50685028; X64-SKX-LARGE-LABEL: test_gather_16f32_mask_index_pair:
@@ -5073,23 +5033,11 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
50735033; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
50745034; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
50755035; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
5076- ; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
5077- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
5078- ; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
5079- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
5080- ; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
5081- ; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
5082- ; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
5083- ; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0
5084- ; X64-SKX-LARGE-NEXT: kmovw %k2, %k3
5085- ; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
5086- ; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5
5087- ; X64-SKX-LARGE-NEXT: kmovw %k1, %k3
5088- ; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
5089- ; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
5090- ; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
5091- ; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
5092- ; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
5036+ ; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
5037+ ; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
5038+ ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
5039+ ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
5040+ ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm2,8), %zmm1 {%k1}
50935041; X64-SKX-LARGE-NEXT: retq
50945042;
50955043; X86-SKX-LABEL: test_gather_16f32_mask_index_pair:
@@ -5099,11 +5047,13 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
50995047; X86-SKX-NEXT: vpmovd2m %zmm0, %k1
51005048; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
51015049; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx
5102- ; X86-SKX-NEXT: vpslld $4, (%ecx), %zmm2
5050+ ; X86-SKX-NEXT: vmovdqu64 (%ecx), %zmm0
5051+ ; X86-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
5052+ ; X86-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm2
51035053; X86-SKX-NEXT: kmovw %k1, %k2
51045054; X86-SKX-NEXT: vmovaps %zmm1, %zmm0
5105- ; X86-SKX-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
5106- ; X86-SKX-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
5055+ ; X86-SKX-NEXT: vgatherdps (%eax,%zmm2,8), %zmm0 {%k2}
5056+ ; X86-SKX-NEXT: vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
51075057; X86-SKX-NEXT: retl
51085058 %wide.load = load <16 x i32>, ptr %arr, align 4
51095059 %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>