@@ -231,22 +231,13 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
231
231
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
232
232
; GFX10-NEXT: s_setpc_b64 s[30:31]
233
233
;
234
- ; GFX11TRUE16-LABEL: v_copysign_bf16_f32:
235
- ; GFX11TRUE16: ; %bb.0:
236
- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
238
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
239
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1
240
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
241
- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
242
- ;
243
- ; GFX11FAKE16-LABEL: v_copysign_bf16_f32:
244
- ; GFX11FAKE16: ; %bb.0:
245
- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
247
- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
248
- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
249
- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
234
+ ; GFX11-LABEL: v_copysign_bf16_f32:
235
+ ; GFX11: ; %bb.0:
236
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237
+ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
238
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
239
+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
240
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
250
241
%sign = fptrunc float %sign.f32 to bfloat
251
242
%op = call bfloat @llvm.copysign.bf16 (bfloat %mag , bfloat %sign )
252
243
ret bfloat %op
@@ -298,22 +289,13 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
298
289
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
299
290
; GFX10-NEXT: s_setpc_b64 s[30:31]
300
291
;
301
- ; GFX11TRUE16-LABEL: v_copysign_bf16_f64:
302
- ; GFX11TRUE16: ; %bb.0:
303
- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
305
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
306
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2
307
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
308
- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
309
- ;
310
- ; GFX11FAKE16-LABEL: v_copysign_bf16_f64:
311
- ; GFX11FAKE16: ; %bb.0:
312
- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
313
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
314
- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
315
- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
316
- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
292
+ ; GFX11-LABEL: v_copysign_bf16_f64:
293
+ ; GFX11: ; %bb.0:
294
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295
+ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
296
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
297
+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
298
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
317
299
%sign = fptrunc double %sign.f64 to bfloat
318
300
%op = call bfloat @llvm.copysign.bf16 (bfloat %mag , bfloat %sign )
319
301
ret bfloat %op
@@ -499,9 +481,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
499
481
;
500
482
; GFX11TRUE16-LABEL: s_copysign_bf16_f32:
501
483
; GFX11TRUE16: ; %bb.0:
502
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
484
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
485
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1
503
486
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
504
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s1
487
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
505
488
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
506
489
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
507
490
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
@@ -575,9 +558,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
575
558
;
576
559
; GFX11TRUE16-LABEL: s_copysign_bf16_f64:
577
560
; GFX11TRUE16: ; %bb.0:
578
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
561
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
562
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2
579
563
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
580
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s2
564
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
581
565
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
582
566
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
583
567
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
@@ -3677,9 +3661,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f
3677
3661
;
3678
3662
; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32:
3679
3663
; GFX11TRUE16: ; %bb.0:
3680
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
3664
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
3665
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1
3681
3666
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3682
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s1
3667
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
3683
3668
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3684
3669
; GFX11TRUE16-NEXT: ; return to shader part epilog
3685
3670
;
@@ -3744,9 +3729,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, d
3744
3729
;
3745
3730
; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64:
3746
3731
; GFX11TRUE16: ; %bb.0:
3747
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
3732
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
3733
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2
3748
3734
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3749
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s2
3735
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
3750
3736
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
3751
3737
; GFX11TRUE16-NEXT: ; return to shader part epilog
3752
3738
;
@@ -6700,15 +6686,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m
6700
6686
; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64:
6701
6687
; GFX11TRUE16: ; %bb.0:
6702
6688
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6703
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
6704
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
6705
- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5
6689
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
6690
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
6691
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
6692
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6693
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v4
6694
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7
6695
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v3
6706
6696
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6707
- ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3
6708
- ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7
6709
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6710
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
6711
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
6697
+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
6698
+ ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4
6712
6699
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
6713
6700
;
6714
6701
; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64:
0 commit comments