@@ -231,22 +231,13 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
231231; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
232232; GFX10-NEXT: s_setpc_b64 s[30:31]
233233;
234- ; GFX11TRUE16-LABEL: v_copysign_bf16_f32:
235- ; GFX11TRUE16: ; %bb.0:
236- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
238- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
239- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1
240- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
241- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
242- ;
243- ; GFX11FAKE16-LABEL: v_copysign_bf16_f32:
244- ; GFX11FAKE16: ; %bb.0:
245- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
247- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
248- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
249- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
234+ ; GFX11-LABEL: v_copysign_bf16_f32:
235+ ; GFX11: ; %bb.0:
236+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237+ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
238+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
239+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
240+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
250241 %sign = fptrunc float %sign.f32 to bfloat
251242 %op = call bfloat @llvm.copysign.bf16 (bfloat %mag , bfloat %sign )
252243 ret bfloat %op
@@ -298,22 +289,13 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
298289; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
299290; GFX10-NEXT: s_setpc_b64 s[30:31]
300291;
301- ; GFX11TRUE16-LABEL: v_copysign_bf16_f64:
302- ; GFX11TRUE16: ; %bb.0:
303- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
305- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
306- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2
307- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
308- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
309- ;
310- ; GFX11FAKE16-LABEL: v_copysign_bf16_f64:
311- ; GFX11FAKE16: ; %bb.0:
312- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
313- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
314- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
315- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
316- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
292+ ; GFX11-LABEL: v_copysign_bf16_f64:
293+ ; GFX11: ; %bb.0:
294+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295+ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
296+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
297+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
298+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
317299 %sign = fptrunc double %sign.f64 to bfloat
318300 %op = call bfloat @llvm.copysign.bf16 (bfloat %mag , bfloat %sign )
319301 ret bfloat %op
@@ -499,9 +481,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
499481;
500482; GFX11TRUE16-LABEL: s_copysign_bf16_f32:
501483; GFX11TRUE16: ; %bb.0:
502- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
484+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
485+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1
503486; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
504- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s1
487+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
505488; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
506489; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
507490; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
@@ -575,9 +558,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
575558;
576559; GFX11TRUE16-LABEL: s_copysign_bf16_f64:
577560; GFX11TRUE16: ; %bb.0:
578- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
561+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
562+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2
579563; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
580- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s2
564+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
581565; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
582566; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
583567; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
@@ -3677,9 +3661,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f
36773661;
36783662; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32:
36793663; GFX11TRUE16: ; %bb.0:
3680- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
3664+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
3665+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1
36813666; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3682- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s1
3667+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
36833668; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
36843669; GFX11TRUE16-NEXT: ; return to shader part epilog
36853670;
@@ -3744,9 +3729,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, d
37443729;
37453730; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64:
37463731; GFX11TRUE16: ; %bb.0:
3747- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
3732+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
3733+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2
37483734; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3749- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000 , v0, s2
3735+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff , v0, v1
37503736; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
37513737; GFX11TRUE16-NEXT: ; return to shader part epilog
37523738;
@@ -6700,15 +6686,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m
67006686; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64:
67016687; GFX11TRUE16: ; %bb.0:
67026688; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6703- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
6704- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
6705- ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5
6689+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
6690+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
6691+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
6692+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
6693+ ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v4
6694+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7
6695+ ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v3
67066696; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
6707- ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3
6708- ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7
6709- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
6710- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
6711- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
6697+ ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
6698+ ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4
67126699; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
67136700;
67146701; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64:
0 commit comments