@@ -108,13 +108,14 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
108108; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
109109; GFX11-NEXT: v_readfirstlane_b32 s0, v0
110110; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
111+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
111112; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
112- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
113113; GFX11-NEXT: s_add_i32 s1, s1, s0
114114; GFX11-NEXT: s_bitset1_b32 s0, 22
115115; GFX11-NEXT: s_addk_i32 s1, 0x7fff
116116; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
117117; GFX11-NEXT: s_cselect_b32 s0, s0, s1
118+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
118119; GFX11-NEXT: s_lshr_b32 s0, s0, 16
119120; GFX11-NEXT: ; return to shader part epilog
120121;
@@ -125,6 +126,7 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
125126; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
126127; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
127128; GFX12-NEXT: v_readfirstlane_b32 s0, v0
129+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
128130; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
129131; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
130132; GFX12-NEXT: s_wait_alu 0xfffe
@@ -305,10 +307,11 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
305307; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
306308; GFX11-NEXT: v_readfirstlane_b32 s2, v0
307309; GFX11-NEXT: v_cmp_u_f32_e64 s1, v0, v0
308- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2 ) | instid1(SALU_CYCLE_1 )
310+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1 ) | instid1(VALU_DEP_2 )
309311; GFX11-NEXT: v_readfirstlane_b32 s0, v1
310312; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
311313; GFX11-NEXT: s_bfe_u32 s3, s0, 0x10010
314+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
312315; GFX11-NEXT: s_add_i32 s3, s3, s0
313316; GFX11-NEXT: s_bitset1_b32 s0, 22
314317; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -338,6 +341,7 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
338341; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
339342; GFX12-NEXT: v_readfirstlane_b32 s2, v0
340343; GFX12-NEXT: v_readfirstlane_b32 s0, v1
344+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
341345; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
342346; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
343347; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1161,13 +1165,14 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
11611165; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
11621166; GFX11-NEXT: v_readfirstlane_b32 s0, v0
11631167; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
1168+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
11641169; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
1165- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
11661170; GFX11-NEXT: s_add_i32 s1, s1, s0
11671171; GFX11-NEXT: s_bitset1_b32 s0, 22
11681172; GFX11-NEXT: s_addk_i32 s1, 0x7fff
11691173; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
11701174; GFX11-NEXT: s_cselect_b32 s0, s0, s1
1175+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11711176; GFX11-NEXT: s_ashr_i32 s0, s0, 16
11721177; GFX11-NEXT: ; return to shader part epilog
11731178;
@@ -1178,6 +1183,7 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
11781183; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11791184; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
11801185; GFX12-NEXT: v_readfirstlane_b32 s0, v0
1186+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
11811187; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
11821188; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
11831189; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1358,10 +1364,11 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
13581364; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
13591365; GFX11-NEXT: v_readfirstlane_b32 s2, v0
13601366; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
1361- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2 ) | instid1(SALU_CYCLE_1 )
1367+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1 ) | instid1(VALU_DEP_2 )
13621368; GFX11-NEXT: v_readfirstlane_b32 s1, v1
13631369; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
13641370; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010
1371+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
13651372; GFX11-NEXT: s_add_i32 s3, s3, s1
13661373; GFX11-NEXT: s_bitset1_b32 s1, 22
13671374; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -1391,6 +1398,7 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
13911398; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13921399; GFX12-NEXT: v_readfirstlane_b32 s2, v0
13931400; GFX12-NEXT: v_readfirstlane_b32 s0, v1
1401+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
13941402; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
13951403; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
13961404; GFX12-NEXT: s_wait_alu 0xfffe
0 commit comments