-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Closed
Labels
backend:X86good first issuehttps://github.com/llvm/llvm-project/contributehttps://github.com/llvm/llvm-project/contributemissed-optimization
Description
; gather_all: gather 16 floats from %base, indexed per lane by %ind, with every
; lane of the gather mask enabled. The i16 %mask argument is not used in the body.
define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
; Splat %base across all 16 lanes: insert it into lane 0, then shuffle with an
; all-zero index vector so every lane reads lane 0.
%broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; Sign-extend the 32-bit indices to 64 bits and form one float address per lane.
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; Masked gather (alignment operand 4) with a constant all-true <16 x i1> mask and
; an undef pass-through vector.
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef)
ret <16 x float> %res
}
; gather_lower: gather floats from %base, indexed per lane by %ind, with only
; lanes 0-7 of the gather mask enabled (lanes 8-15 are constant false).
; The i16 %mask argument is not used in the body.
define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
; Splat %base across all 16 lanes: insert it into lane 0, then shuffle with an
; all-zero index vector so every lane reads lane 0.
%broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
; Sign-extend the 32-bit indices to 64 bits and form one float address per lane.
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; Masked gather (alignment operand 4) whose constant mask is true for the first
; eight lanes and false for the last eight; the pass-through vector is undef.
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef)
ret <16 x float> %res
}
llc -mcpu=x86-64-v4
# Compiler output for the all-lanes gather (AT&T syntax).
# NOTE(review): rdi and zmm0 presumably carry %base and %ind from the IR above - confirm against the target ABI.
gather_all: # @gather_all
# k1 = NOT(k0 XOR k0): all 16 low mask bits set, so every lane is enabled.
kxnorw %k0, %k0, %k1
# Zero the gather destination before it is merged into under the mask.
vxorps %xmm1, %xmm1, %xmm1
# Load 16 floats from rdi + 4*zmm0[i] into zmm1 for the lanes selected by k1.
vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
# Copy the gathered vector into zmm0 before returning.
vmovaps %zmm1, %zmm0
retq
# Compiler output for the lower-8-lanes gather (AT&T syntax).
# NOTE(review): rdi and zmm0 presumably carry %base and %ind from the IR above - confirm against the target ABI.
gather_lower: # @gather_lower
# Zero the gather destination before it is merged into under the mask.
vxorps %xmm1, %xmm1, %xmm1
# Build the mask through a general-purpose register: ax = 255 (low 8 bits set),
# then copy eax into k1. This MOV+KMOVD pair is the sequence the report wants replaced.
movw $255, %ax
kmovd %eax, %k1
# Load floats from rdi + 4*zmm0[i] into zmm1 for the lanes selected by k1.
vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
# Copy the gathered vector into zmm0.
vmovaps %zmm1, %zmm0
retq
The gather_lower code needs to initialize the lower 8 bits of a <16 x i1> k-reg mask (with all upper bits zero) - we should be able to use kxnorb %k0, %k0, %k1 on AVX512DQ targets to handle this instead of the MOV+KMOVD.
There will be similar cases where we want just the lower 8/16/32 bits set of a <32 x i1> or <64 x i1> mask on AVX512DQ/BW targets that can be handled with kxnorw/kxnord.
Hopefully this can be handled as tablegen patterns extending the existing 'KSET1' patterns.
Metadata
Metadata
Assignees
Labels
backend:X86good first issuehttps://github.com/llvm/llvm-project/contributehttps://github.com/llvm/llvm-project/contributemissed-optimization