@@ -135,24 +135,76 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
   ret <16 x float> %res
 }
 
-; Test case 5: v32i1 mask via bitconvert, lower 16 bits set (tests bitconvert pattern)
-define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b) {
-; AVX512-LABEL: mask_v32i1_lower16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT: retq
-  %mask = bitcast i32 65535 to <32 x i1>
+; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
+; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle.
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b,
+                                      <32 x i16> %c, <32 x i16> %d) {
+; AVX512F-LABEL: mask_v32i1_lower16:
+; AVX512F: vextracti64x4
+; AVX512F: vpcmpgtw
+; AVX512F: vpternlogd
+; AVX512F: vinserti64x4
+; AVX512F: vpternlogq
+;
+; AVX512DQ-LABEL: mask_v32i1_lower16:
+; AVX512DQ: vextracti64x4
+; AVX512DQ: vpcmpgtw
+; AVX512DQ: vpternlogd
+; AVX512DQ: vinserti64x4
+; AVX512DQ: vpternlogq
+;
+; AVX512BW-LABEL: mask_v32i1_lower16:
+; AVX512BW: movl $65535, %eax
+; AVX512BW: kmovd %eax, %k0
+; AVX512BW: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512BW: kord %k0, %k1, %k1
+; AVX512BW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+;
+; AVX512DQBW-LABEL: mask_v32i1_lower16:
+; AVX512DQBW: kxnorw %k0, %k0, %k0
+; AVX512DQBW: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512DQBW: kord %k0, %k1, %k1
+; AVX512DQBW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+  %mask0 = bitcast i32 65535 to <32 x i1>
+  %mask1 = icmp sgt <32 x i16> %c, %d
+  %mask = or <32 x i1> %mask0, %mask1
   %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b
   ret <32 x i16> %res
 }
 
-; Test case 6: v64i1 mask via bitconvert, lower 32 bits set (tests bitconvert pattern)
-define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b) {
-; AVX512-LABEL: mask_v64i1_lower32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512-NEXT: retq
-  %mask = bitcast i64 4294967295 to <64 x i1>
+; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
+; Verifies the KSET1D submask pattern survives past SelectionDAG combines.
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b,
+                                     <64 x i8> %c, <64 x i8> %d) {
+; AVX512F-LABEL: mask_v64i1_lower32:
+; AVX512F: vextracti64x4
+; AVX512F: vpcmpgtb
+; AVX512F: vpternlogd
+; AVX512F: vinserti64x4
+; AVX512F: vpternlogq
+;
+; AVX512DQ-LABEL: mask_v64i1_lower32:
+; AVX512DQ: vextracti64x4
+; AVX512DQ: vpcmpgtb
+; AVX512DQ: vpternlogd
+; AVX512DQ: vinserti64x4
+; AVX512DQ: vpternlogq
+;
+; AVX512BW-LABEL: mask_v64i1_lower32:
+; AVX512BW: movl $4294967295, %eax
+; AVX512BW: kmovq %rax, %k0
+; AVX512BW: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512BW: korq %k0, %k1, %k1
+; AVX512BW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+;
+; AVX512DQBW-LABEL: mask_v64i1_lower32:
+; AVX512DQBW: kxnord %k0, %k0, %k0
+; AVX512DQBW: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512DQBW: korq %k0, %k1, %k1
+; AVX512DQBW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+  %mask0 = bitcast i64 4294967295 to <64 x i1>
+  %mask1 = icmp sgt <64 x i8> %c, %d
+  %mask = or <64 x i1> %mask0, %mask1
   %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
   ret <64 x i8> %res
 }
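
For reference, here is a C-intrinsics analogue of the new v32i1 test (a sketch, not part of the commit; the function name blend_lower16_or_cmp and its signature are illustrative assumptions). On an AVX512BW target, a constant lower-16 submask ORed with a dynamic compare should lower to roughly the kmovd/kord/vpblendmw sequence the AVX512BW CHECK lines above expect:

#include <immintrin.h>

/* Illustrative sketch, not part of the commit: C-intrinsics analogue of
 * @mask_v32i1_lower16. k0 is the constant lower-16 submask, k1 the dynamic
 * compare; their OR drives a blend, mirroring select(%mask, %a, %b). */
__m512i blend_lower16_or_cmp(__m512i a, __m512i b, __m512i c, __m512i d) {
  __mmask32 k0 = _cvtu32_mask32(0xFFFFu);       /* i32 65535 -> <32 x i1>  */
  __mmask32 k1 = _mm512_cmpgt_epi16_mask(c, d); /* icmp sgt %c, %d         */
  __mmask32 k  = _kor_mask32(k0, k1);           /* or %mask0, %mask1       */
  return _mm512_mask_blend_epi16(k, b, a);      /* %a where k set, else %b */
}

The v64i1 case has the same shape with the 64-bit mask intrinsics (_cvtu64_mask64, _mm512_cmpgt_epi8_mask, _kor_mask64, _mm512_mask_blend_epi8).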