@@ -1303,13 +1303,18 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
13031303; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32:
13041304; GFX11-TRUE16: ; %bb.0:
13051305; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1307- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
1308- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1306+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
1307+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
1308+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
13091309; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1310+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1311+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
1312+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
1313+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
1314+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
13101315; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1311- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
1312- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
1316+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_4
1317+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
13131318; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
13141319; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13151320; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1322,15 +1327,16 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
13221327; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
13231328; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13241329; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
1325- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
1326- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
1330+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
1331+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
13271332; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
1328- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1329- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
1330- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
1331- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
1332- ; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
1333+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
1334+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
1335+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
1336+ ; GFX11-TRUE16-NEXT: .LBB14_4: ; %end
13331337; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
1338+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1339+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
13341340; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
13351341;
13361342; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32:
@@ -3543,13 +3549,18 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
35433549; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32:
35443550; GFX11-TRUE16: ; %bb.0:
35453551; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3546- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
3547- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
3548- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3552+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
3553+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
3554+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
35493555; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3556+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3557+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
3558+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
3559+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
3560+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
35503561; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
3551- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
3552- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
3562+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4
3563+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
35533564; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
35543565; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35553566; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3562,15 +3573,16 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
35623573; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
35633574; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
35643575; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
3565- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
3566- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
3576+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
3577+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
35673578; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3568- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
3569- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
3570- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
3571- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
3572- ; GFX11-TRUE16-NEXT: .LBB34_2: ; %end
3579+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
3580+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3581+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
3582+ ; GFX11-TRUE16-NEXT: .LBB34_4: ; %end
35733583; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
3584+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
3585+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
35743586; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
35753587;
35763588; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32:
@@ -7051,13 +7063,18 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
70517063; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16:
70527064; GFX11-TRUE16: ; %bb.0:
70537065; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7054- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
7055- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
7056- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7066+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
7067+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
7068+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
70577069; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7070+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7071+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
7072+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
7073+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
7074+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
70587075; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
7059- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
7060- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
7076+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_4
7077+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
70617078; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
70627079; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
70637080; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7070,15 +7087,16 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
70707087; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
70717088; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
70727089; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
7073- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
7074- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
7090+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
7091+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
70757092; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
7076- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
7077- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
7078- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
7079- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
7080- ; GFX11-TRUE16-NEXT: .LBB62_2: ; %end
7093+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
7094+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
7095+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
7096+ ; GFX11-TRUE16-NEXT: .LBB62_4: ; %end
70817097; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
7098+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7099+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
70827100; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
70837101;
70847102; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16:
@@ -8488,13 +8506,18 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
84888506; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32:
84898507; GFX11-TRUE16: ; %bb.0:
84908508; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8491- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
8492- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
8493- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8509+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
8510+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
8511+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
84948512; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8513+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8514+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
8515+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
8516+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
8517+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
84958518; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
8496- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
8497- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
8519+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4
8520+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
84988521; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
84998522; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
85008523; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8507,15 +8530,16 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
85078530; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
85088531; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
85098532; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
8510- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
8511- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
8533+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
8534+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
85128535; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
8513- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
8514- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
8515- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
8516- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
8517- ; GFX11-TRUE16-NEXT: .LBB72_2: ; %end
8536+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
8537+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
8538+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
8539+ ; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
85188540; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
8541+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
8542+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
85198543; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
85208544;
85218545; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32:
@@ -9062,15 +9086,14 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
90629086; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
90639087; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
90649088; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
9065- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
9089+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4 )
90669090; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
90679091; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9068- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
90699092; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
9070- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(NEXT) | instid1(VALU_DEP_1)
9071- ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
9072- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
9073- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
9093+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(NEXT) | instid1(VALU_DEP_1)
9094+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
9095+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v2
9096+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
90749097; GFX11-TRUE16-NEXT: .LBB76_4: ; %end
90759098; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
90769099; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
0 commit comments