@@ -5245,3 +5245,316 @@ bb:
   ret <8 x i64> %tmp1
 }
 declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+
+; Test gathers of float fields from an array of structs.
+%struct.pt = type { float, float, float, i32 }
+
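+; Each index is masked to 29 bits (536870911 = 0x1FFFFFFF) and zero-extended
+; before indexing. %struct.pt is 16 bytes, and gather addressing only encodes
+; scales of 1/2/4/8, so the 16-byte stride is materialized as a shift by 4.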
+define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_structpt_16f32_mask_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_structpt_16f32_mask_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_structpt_16f32_mask_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+}
+
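+; Same index computation, but the gather loads field 1 of %struct.pt (byte
+; offset 4); the constant offset folds into the gather's displacement.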
+define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm0
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm0
+; KNL_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_structpt_16f32_mask_index_offset:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm0
+; SKX_32-NEXT: vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+}
+
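+; Two gathers from fields 0 and 1 share one index vector; the scaled indices
+; should be computed once and reused with displacements 0 and 4.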
+define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
+; KNL_64-LABEL: test_gather_16f32_mask_index_pair:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vmovdqu64 (%rsi), %zmm0
+; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; KNL_64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; KNL_64-NEXT: vpsllq $4, %zmm0, %zmm3
+; KNL_64-NEXT: vpsllq $4, %zmm2, %zmm2
+; KNL_64-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; KNL_64-NEXT: kshiftrw $8, %k1, %k2
+; KNL_64-NEXT: kmovw %k2, %k3
+; KNL_64-NEXT: vmovaps %ymm4, %ymm0
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; KNL_64-NEXT: vmovaps %ymm1, %ymm5
+; KNL_64-NEXT: kmovw %k1, %k3
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; KNL_64-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_gather_16f32_mask_index_pair:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: vpslld $4, (%ecx), %zmm2
+; KNL_32-NEXT: vpbroadcastd %eax, %zmm0
+; KNL_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
+; KNL_32-NEXT: kmovw %k1, %k2
+; KNL_32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
+; KNL_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
+; SKX_SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; SKX_SMALL-NEXT: kshiftrw $8, %k1, %k2
+; SKX_SMALL-NEXT: kmovw %k2, %k3
+; SKX_SMALL-NEXT: vmovaps %ymm4, %ymm0
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; SKX_SMALL-NEXT: vmovaps %ymm1, %ymm5
+; SKX_SMALL-NEXT: kmovw %k1, %k3
+; SKX_SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; SKX_SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; SKX_LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
+; SKX_LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
+; SKX_LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
+; SKX_LARGE-NEXT: kshiftrw $8, %k1, %k2
+; SKX_LARGE-NEXT: vmovaps %ymm4, %ymm0
+; SKX_LARGE-NEXT: kmovw %k2, %k3
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
+; SKX_LARGE-NEXT: vmovaps %ymm1, %ymm5
+; SKX_LARGE-NEXT: kmovw %k1, %k3
+; SKX_LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
+; SKX_LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_gather_16f32_mask_index_pair:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
+; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: vpslld $4, (%ecx), %zmm2
+; SKX_32-NEXT: vpbroadcastd %eax, %zmm0
+; SKX_32-NEXT: vpaddd %zmm2, %zmm0, %zmm3
+; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm2), %zmm0 {%k2}
+; SKX_32-NEXT: vgatherdps 4(,%zmm3), %zmm1 {%k1}
+; SKX_32-NEXT: retl
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %and = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %zext = zext <16 x i32> %and to <16 x i64>
+  %ptrs1 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
+  %res1 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs1, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %pair1 = insertvalue {<16 x float>, <16 x float>} undef, <16 x float> %res1, 0
+  %pair2 = insertvalue {<16 x float>, <16 x float>} %pair1, <16 x float> %res, 1
+  ret {<16 x float>, <16 x float>} %pair2
+}