@@ -137,34 +137,43 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
 
 ; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
 ; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle.
-define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b,
-                                      <32 x i16> %c, <32 x i16> %d) {
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
 ; AVX512F-LABEL: mask_v32i1_lower16:
-; AVX512F: vextracti64x4
-; AVX512F: vpcmpgtw
-; AVX512F: vpternlogd
-; AVX512F: vinserti64x4
-; AVX512F: vpternlogq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: mask_v32i1_lower16:
-; AVX512DQ: vextracti64x4
-; AVX512DQ: vpcmpgtw
-; AVX512DQ: vpternlogd
-; AVX512DQ: vinserti64x4
-; AVX512DQ: vpternlogq
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: mask_v32i1_lower16:
-; AVX512BW: movl $65535, %eax
-; AVX512BW: kmovd %eax, %k0
-; AVX512BW: vpcmpgtw %zmm3, %zmm2, %k1
-; AVX512BW: kord %k0, %k1, %k1
-; AVX512BW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: kord %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
 ;
 ; AVX512DQBW-LABEL: mask_v32i1_lower16:
-; AVX512DQBW: kxnorw %k0, %k0, %k0
-; AVX512DQBW: vpcmpgtw %zmm3, %zmm2, %k1
-; AVX512DQBW: kord %k0, %k1, %k1
-; AVX512DQBW: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: kord %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
   %mask0 = bitcast i32 65535 to <32 x i1>
   %mask1 = icmp sgt <32 x i16> %c, %d
   %mask = or <32 x i1> %mask0, %mask1
@@ -174,34 +183,43 @@ define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b,
 
 ; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
 ; Verifies the KSET1D submask pattern survives past SelectionDAG combines.
-define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b,
-                                     <64 x i8> %c, <64 x i8> %d) {
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; AVX512F-LABEL: mask_v64i1_lower32:
-; AVX512F: vextracti64x4
-; AVX512F: vpcmpgtb
-; AVX512F: vpternlogd
-; AVX512F: vinserti64x4
-; AVX512F: vpternlogq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: mask_v64i1_lower32:
-; AVX512DQ: vextracti64x4
-; AVX512DQ: vpcmpgtb
-; AVX512DQ: vpternlogd
-; AVX512DQ: vinserti64x4
-; AVX512DQ: vpternlogq
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: mask_v64i1_lower32:
-; AVX512BW: movl $4294967295, %eax
-; AVX512BW: kmovq %rax, %k0
-; AVX512BW: vpcmpgtb %zmm3, %zmm2, %k1
-; AVX512BW: korq %k0, %k1, %k1
-; AVX512BW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; AVX512BW-NEXT: kmovq %rax, %k0
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: korq %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
 ;
 ; AVX512DQBW-LABEL: mask_v64i1_lower32:
-; AVX512DQBW: kxnord %k0, %k0, %k0
-; AVX512DQBW: vpcmpgtb %zmm3, %zmm2, %k1
-; AVX512DQBW: korq %k0, %k1, %k1
-; AVX512DQBW: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: korq %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
   %mask0 = bitcast i64 4294967295 to <64 x i1>
   %mask1 = icmp sgt <64 x i8> %c, %d
   %mask = or <64 x i1> %mask0, %mask1