@@ -157,6 +157,160 @@ define <8 x float> @f6(<8 x float> %a) {
157157 ret <8 x float > %3
158158}
159159
; f7: arithmetic fence on a scalar half. Expects exactly one #ARITH_FENCE
; pseudo-marker in the lowered code on both 32-bit (X86) and 64-bit (X64)
; targets; on X86 the half arg is first loaded from the stack via pinsrw.
; NOTE(review): CHECK lines appear autogenerated (update_llc_test_checks.py
; style) — regenerate rather than hand-editing them.
160+ define half @f7 (half %a ) nounwind {
161+ ; X86-LABEL: f7:
162+ ; X86: # %bb.0:
163+ ; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
164+ ; X86-NEXT: #ARITH_FENCE
165+ ; X86-NEXT: retl
166+ ;
167+ ; X64-LABEL: f7:
168+ ; X64: # %bb.0:
169+ ; X64-NEXT: #ARITH_FENCE
170+ ; X64-NEXT: retq
171+ %b = call half @llvm.arithmetic.fence.f16 (half %a )
172+ ret half %b
173+ }
174+
; f8: arithmetic fence on a scalar bfloat. The fence is applied to the value
; while it is held in a GPR (eax) on both targets — one #ARITH_FENCE marker —
; then reinserted into xmm0 for the return.
; NOTE(review): CHECK lines appear autogenerated — regenerate, don't hand-edit.
175+ define bfloat @f8 (bfloat %a ) nounwind {
176+ ; X86-LABEL: f8:
177+ ; X86: # %bb.0:
178+ ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
179+ ; X86-NEXT: #ARITH_FENCE
180+ ; X86-NEXT: pinsrw $0, %eax, %xmm0
181+ ; X86-NEXT: retl
182+ ;
183+ ; X64-LABEL: f8:
184+ ; X64: # %bb.0:
185+ ; X64-NEXT: pextrw $0, %xmm0, %eax
186+ ; X64-NEXT: #ARITH_FENCE
187+ ; X64-NEXT: pinsrw $0, %eax, %xmm0
188+ ; X64-NEXT: retq
189+ %b = call bfloat @llvm.arithmetic.fence.bf16 (bfloat %a )
190+ ret bfloat %b
191+ }
192+
; f9: arithmetic fence on <2 x half>. The vector is split per element
; (psrld $16 isolates the high half into xmm1), each element gets its own
; fence — two #ARITH_FENCE markers — and the pair is rebuilt with punpcklwd.
; Identical lowering is expected on X86 and X64.
; NOTE(review): CHECK lines appear autogenerated — regenerate, don't hand-edit.
193+ define <2 x half > @f9 (<2 x half > %a ) nounwind {
194+ ; X86-LABEL: f9:
195+ ; X86: # %bb.0:
196+ ; X86-NEXT: movdqa %xmm0, %xmm1
197+ ; X86-NEXT: psrld $16, %xmm1
198+ ; X86-NEXT: #ARITH_FENCE
199+ ; X86-NEXT: #ARITH_FENCE
200+ ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
201+ ; X86-NEXT: retl
202+ ;
203+ ; X64-LABEL: f9:
204+ ; X64: # %bb.0:
205+ ; X64-NEXT: movdqa %xmm0, %xmm1
206+ ; X64-NEXT: psrld $16, %xmm1
207+ ; X64-NEXT: #ARITH_FENCE
208+ ; X64-NEXT: #ARITH_FENCE
209+ ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
210+ ; X64-NEXT: retq
211+ %b = call <2 x half > @llvm.arithmetic.fence.v2f16 (<2 x half > %a )
212+ ret <2 x half > %b
213+ }
214+
; f10: arithmetic fence on <3 x bfloat>. All three elements are extracted to
; GPRs (eax/ecx/edx) via pextrw after shifting/shuffling each lane into
; position, fenced individually — three #ARITH_FENCE markers — and the vector
; is reassembled with pinsrw + punpcklwd/punpckldq. X86 and X64 lower the same.
; NOTE(review): CHECK lines appear autogenerated — regenerate, don't hand-edit.
215+ define <3 x bfloat> @f10 (<3 x bfloat> %a ) nounwind {
216+ ; X86-LABEL: f10:
217+ ; X86: # %bb.0:
218+ ; X86-NEXT: pextrw $0, %xmm0, %eax
219+ ; X86-NEXT: movdqa %xmm0, %xmm1
220+ ; X86-NEXT: psrld $16, %xmm1
221+ ; X86-NEXT: pextrw $0, %xmm1, %ecx
222+ ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
223+ ; X86-NEXT: pextrw $0, %xmm0, %edx
224+ ; X86-NEXT: #ARITH_FENCE
225+ ; X86-NEXT: #ARITH_FENCE
226+ ; X86-NEXT: #ARITH_FENCE
227+ ; X86-NEXT: pinsrw $0, %eax, %xmm0
228+ ; X86-NEXT: pinsrw $0, %ecx, %xmm1
229+ ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
230+ ; X86-NEXT: pinsrw $0, %edx, %xmm1
231+ ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
232+ ; X86-NEXT: retl
233+ ;
234+ ; X64-LABEL: f10:
235+ ; X64: # %bb.0:
236+ ; X64-NEXT: pextrw $0, %xmm0, %eax
237+ ; X64-NEXT: movdqa %xmm0, %xmm1
238+ ; X64-NEXT: psrld $16, %xmm1
239+ ; X64-NEXT: pextrw $0, %xmm1, %ecx
240+ ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
241+ ; X64-NEXT: pextrw $0, %xmm0, %edx
242+ ; X64-NEXT: #ARITH_FENCE
243+ ; X64-NEXT: #ARITH_FENCE
244+ ; X64-NEXT: #ARITH_FENCE
245+ ; X64-NEXT: pinsrw $0, %eax, %xmm0
246+ ; X64-NEXT: pinsrw $0, %ecx, %xmm1
247+ ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
248+ ; X64-NEXT: pinsrw $0, %edx, %xmm1
249+ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
250+ ; X64-NEXT: retq
251+ %b = call <3 x bfloat> @llvm.arithmetic.fence.v3bf16 (<3 x bfloat> %a )
252+ ret <3 x bfloat> %b
253+ }
254+
; f11: arithmetic fence on <4 x bfloat>. Four per-element GPR extractions and
; four #ARITH_FENCE markers, then reassembly via pinsrw + punpcklwd/punpckldq.
; The X86 variant needs a fourth scratch GPR, so it spills/restores the
; callee-saved %esi (pushl/popl); X64 has enough volatile GPRs and does not.
; NOTE(review): CHECK lines appear autogenerated — regenerate, don't hand-edit.
255+ define <4 x bfloat> @f11 (<4 x bfloat> %a ) nounwind {
256+ ; X86-LABEL: f11:
257+ ; X86: # %bb.0:
258+ ; X86-NEXT: pushl %esi
259+ ; X86-NEXT: movdqa %xmm0, %xmm1
260+ ; X86-NEXT: psrlq $48, %xmm1
261+ ; X86-NEXT: pextrw $0, %xmm1, %eax
262+ ; X86-NEXT: movdqa %xmm0, %xmm1
263+ ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
264+ ; X86-NEXT: pextrw $0, %xmm1, %edx
265+ ; X86-NEXT: pextrw $0, %xmm0, %ecx
266+ ; X86-NEXT: psrld $16, %xmm0
267+ ; X86-NEXT: pextrw $0, %xmm0, %esi
268+ ; X86-NEXT: #ARITH_FENCE
269+ ; X86-NEXT: #ARITH_FENCE
270+ ; X86-NEXT: #ARITH_FENCE
271+ ; X86-NEXT: #ARITH_FENCE
272+ ; X86-NEXT: pinsrw $0, %eax, %xmm0
273+ ; X86-NEXT: pinsrw $0, %edx, %xmm1
274+ ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
275+ ; X86-NEXT: pinsrw $0, %ecx, %xmm0
276+ ; X86-NEXT: pinsrw $0, %esi, %xmm2
277+ ; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
278+ ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
279+ ; X86-NEXT: popl %esi
280+ ; X86-NEXT: retl
281+ ;
282+ ; X64-LABEL: f11:
283+ ; X64: # %bb.0:
284+ ; X64-NEXT: movdqa %xmm0, %xmm1
285+ ; X64-NEXT: psrlq $48, %xmm1
286+ ; X64-NEXT: pextrw $0, %xmm1, %eax
287+ ; X64-NEXT: movdqa %xmm0, %xmm1
288+ ; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
289+ ; X64-NEXT: pextrw $0, %xmm1, %ecx
290+ ; X64-NEXT: pextrw $0, %xmm0, %edx
291+ ; X64-NEXT: psrld $16, %xmm0
292+ ; X64-NEXT: pextrw $0, %xmm0, %esi
293+ ; X64-NEXT: #ARITH_FENCE
294+ ; X64-NEXT: #ARITH_FENCE
295+ ; X64-NEXT: #ARITH_FENCE
296+ ; X64-NEXT: #ARITH_FENCE
297+ ; X64-NEXT: pinsrw $0, %eax, %xmm0
298+ ; X64-NEXT: pinsrw $0, %ecx, %xmm1
299+ ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
300+ ; X64-NEXT: pinsrw $0, %edx, %xmm0
301+ ; X64-NEXT: pinsrw $0, %esi, %xmm2
302+ ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
303+ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
304+ ; X64-NEXT: retq
305+ %b = call <4 x bfloat> @llvm.arithmetic.fence.v4bf16 (<4 x bfloat> %a )
306+ ret <4 x bfloat> %b
307+ }
308+
; Declarations of the llvm.arithmetic.fence overloads exercised above
; (half/bfloat scalars and short vectors), followed by the f32/f64/v2f32
; overloads already present in the file.
309+ declare half @llvm.arithmetic.fence.f16 (half )
310+ declare bfloat @llvm.arithmetic.fence.bf16 (bfloat)
311+ declare <2 x half > @llvm.arithmetic.fence.v2f16 (<2 x half >)
312+ declare <3 x bfloat> @llvm.arithmetic.fence.v3bf16 (<3 x bfloat>)
313+ declare <4 x bfloat> @llvm.arithmetic.fence.v4bf16 (<4 x bfloat>)
160314declare float @llvm.arithmetic.fence.f32 (float )
161315declare double @llvm.arithmetic.fence.f64 (double )
162316declare <2 x float > @llvm.arithmetic.fence.v2f32 (<2 x float >)
0 commit comments