@@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
147147; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
148148; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0
149149; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0)
150- ; GFX8-OPT-NEXT: s_barrier
151- ; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1
152- ; GFX8-OPT-NEXT: s_nop 1
153- ; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
154- ; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1
155- ; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1
150+ ; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1
151+ ; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1
156152; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
157- ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
153+ ; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
154+ ; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
155+ ; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4
156+ ; GFX8-OPT-NEXT: s_barrier
158157; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2
159158; GFX8-OPT-NEXT: s_endpgm
160159;
@@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
194193; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
195194; GFX10-NEXT: v_mov_b32_e32 v2, 0
196195; GFX10-NEXT: ds_read_b32 v1, v0
197- ; GFX10-NEXT: s_barrier
198- ; GFX10-NEXT: buffer_gl0_inv
199196; GFX10-NEXT: s_waitcnt lgkmcnt(0)
200197; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
201- ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1
202- ; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
203- ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1
198+ ; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1
204199; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
200+ ; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
201+ ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3
202+ ; GFX10-NEXT: s_barrier
203+ ; GFX10-NEXT: buffer_gl0_inv
205204; GFX10-NEXT: flat_store_dword v[0:1], v2
206205; GFX10-NEXT: s_endpgm
207206;
@@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
213212; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
214213; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
215214; GFX11-NEXT: ds_load_b32 v1, v0
216- ; GFX11-NEXT: s_barrier
217- ; GFX11-NEXT: buffer_gl0_inv
218215; GFX11-NEXT: s_waitcnt lgkmcnt(0)
219216; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
220- ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
221- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
222- ; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
223- ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
217+ ; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1
224218; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
219+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
220+ ; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
221+ ; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3
222+ ; GFX11-NEXT: s_barrier
223+ ; GFX11-NEXT: buffer_gl0_inv
225224; GFX11-NEXT: flat_store_b32 v[0:1], v2
226225; GFX11-NEXT: s_endpgm
227226bb:
0 commit comments