@@ -4618,58 +4618,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
46184618; GFX12-NEXT: s_wait_samplecnt 0x0
46194619; GFX12-NEXT: s_wait_bvhcnt 0x0
46204620; GFX12-NEXT: s_wait_kmcnt 0x0
4621- ; GFX12-NEXT: v_mov_b32_e32 v2, v0
4622- ; GFX12-NEXT: v_mov_b32_e32 v0, s4
4623- ; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4624- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4625- ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4626- ; GFX12-NEXT: s_mov_b32 s4, 0
4627- ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
4628- ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
4629- ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4630- ; GFX12-NEXT: s_wait_loadcnt 0x0
4631- ; GFX12-NEXT: v_mov_b32_e32 v5, v0
4621+ ; GFX12-NEXT: v_mov_b32_e32 v1, s4
46324622; GFX12-NEXT: s_wait_storecnt 0x0
4633- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4634- ; GFX12-NEXT: v_pk_add_f16 v4, v5, v2
4635- ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
4636- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4623+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
46374624; GFX12-NEXT: s_wait_loadcnt 0x0
46384625; GFX12-NEXT: global_inv scope:SCOPE_DEV
4639- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4640- ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4641- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4642- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4643- ; GFX12-NEXT: s_cbranch_execnz .LBB12_1
4644- ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4645- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
46464626; GFX12-NEXT: s_setpc_b64 s[30:31]
46474627;
46484628; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
46494629; GFX940: ; %bb.0:
46504630; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4651- ; GFX940-NEXT: v_mov_b32_e32 v2, v0
4652- ; GFX940-NEXT: v_mov_b32_e32 v0, s4
4653- ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
4654- ; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4655- ; GFX940-NEXT: s_mov_b64 s[4:5], 0
4656- ; GFX940-NEXT: v_mov_b32_e32 v3, s6
4657- ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
4658- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4659- ; GFX940-NEXT: s_waitcnt vmcnt(0)
4660- ; GFX940-NEXT: v_mov_b32_e32 v5, v0
4661- ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
4631+ ; GFX940-NEXT: v_mov_b32_e32 v1, s4
46624632; GFX940-NEXT: buffer_wbl2 sc1
4663- ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
4664- ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
4633+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
46654634; GFX940-NEXT: s_waitcnt vmcnt(0)
46664635; GFX940-NEXT: buffer_inv sc1
4667- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4668- ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4669- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4670- ; GFX940-NEXT: s_cbranch_execnz .LBB12_1
4671- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4672- ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
46734636; GFX940-NEXT: s_setpc_b64 s[30:31]
46744637;
46754638; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4735,27 +4698,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
47354698; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
47364699; GFX90A: ; %bb.0:
47374700; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4738- ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
4739- ; GFX90A-NEXT: v_mov_b32_e32 v0, s8
4740- ; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
4741- ; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
4742- ; GFX90A-NEXT: s_mov_b64 s[8:9], 0
4743- ; GFX90A-NEXT: v_mov_b32_e32 v3, s10
4744- ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
4745- ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
4746- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4747- ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
4748- ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
4749- ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
4750- ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
4701+ ; GFX90A-NEXT: v_mov_b32_e32 v1, s8
4702+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
47514703; GFX90A-NEXT: s_waitcnt vmcnt(0)
47524704; GFX90A-NEXT: buffer_wbinvl1
4753- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4754- ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4755- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
4756- ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
4757- ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
4758- ; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
47594705; GFX90A-NEXT: s_setpc_b64 s[30:31]
47604706;
47614707; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4921,56 +4867,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
49214867; GFX12-NEXT: s_wait_bvhcnt 0x0
49224868; GFX12-NEXT: s_wait_kmcnt 0x0
49234869; GFX12-NEXT: v_mov_b32_e32 v1, s4
4924- ; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4925- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4926- ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4927- ; GFX12-NEXT: s_mov_b32 s4, 0
4928- ; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
4929- ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
4930- ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4931- ; GFX12-NEXT: s_wait_loadcnt 0x0
4932- ; GFX12-NEXT: v_pk_add_f16 v1, v2, v0
4933- ; GFX12-NEXT: v_mov_b32_e32 v5, v2
49344870; GFX12-NEXT: s_wait_storecnt 0x0
4935- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
4936- ; GFX12-NEXT: v_mov_b32_e32 v4, v1
4937- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4938- ; GFX12-NEXT: s_wait_loadcnt 0x0
4871+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
4872+ ; GFX12-NEXT: s_wait_storecnt 0x0
49394873; GFX12-NEXT: global_inv scope:SCOPE_DEV
4940- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
4941- ; GFX12-NEXT: v_mov_b32_e32 v2, v4
4942- ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4943- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4944- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4945- ; GFX12-NEXT: s_cbranch_execnz .LBB13_1
4946- ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4947- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
49484874; GFX12-NEXT: s_setpc_b64 s[30:31]
49494875;
49504876; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
49514877; GFX940: ; %bb.0:
49524878; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49534879; GFX940-NEXT: v_mov_b32_e32 v1, s4
4954- ; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
4955- ; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4956- ; GFX940-NEXT: s_mov_b64 s[4:5], 0
4957- ; GFX940-NEXT: v_mov_b32_e32 v1, s6
4958- ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
4959- ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4960- ; GFX940-NEXT: s_waitcnt vmcnt(0)
4961- ; GFX940-NEXT: v_pk_add_f16 v2, v3, v0
49624880; GFX940-NEXT: buffer_wbl2 sc1
4963- ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
4964- ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
4881+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
49654882; GFX940-NEXT: s_waitcnt vmcnt(0)
49664883; GFX940-NEXT: buffer_inv sc1
4967- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
4968- ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4969- ; GFX940-NEXT: v_mov_b32_e32 v3, v4
4970- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4971- ; GFX940-NEXT: s_cbranch_execnz .LBB13_1
4972- ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4973- ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
49744884; GFX940-NEXT: s_setpc_b64 s[30:31]
49754885;
49764886; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5036,25 +4946,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
50364946; GFX90A: ; %bb.0:
50374947; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50384948; GFX90A-NEXT: v_mov_b32_e32 v1, s8
5039- ; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen offset:1024
5040- ; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
5041- ; GFX90A-NEXT: s_mov_b64 s[8:9], 0
5042- ; GFX90A-NEXT: v_mov_b32_e32 v1, s10
5043- ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
5044- ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
5045- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5046- ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
5047- ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
5048- ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
4949+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
50494950; GFX90A-NEXT: s_waitcnt vmcnt(0)
50504951; GFX90A-NEXT: buffer_wbinvl1
5051- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
5052- ; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5053- ; GFX90A-NEXT: v_mov_b32_e32 v3, v4
5054- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
5055- ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
5056- ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
5057- ; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
50584952; GFX90A-NEXT: s_setpc_b64 s[30:31]
50594953;
50604954; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5217,8 +5111,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52175111; GFX12-NEXT: s_wait_samplecnt 0x0
52185112; GFX12-NEXT: s_wait_bvhcnt 0x0
52195113; GFX12-NEXT: s_wait_kmcnt 0x0
5220- ; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
52215114; GFX12-NEXT: s_mov_b32 s1, exec_lo
5115+ ; GFX12-NEXT: s_wait_storecnt 0x0
52225116; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
52235117; GFX12-NEXT: v_readfirstlane_b32 s4, v0
52245118; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -5230,59 +5124,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52305124; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
52315125; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
52325126; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5233- ; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
5127+ ; GFX12-NEXT: s_wait_loadcnt 0x0
5128+ ; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
5129+ ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
52345130; GFX12-NEXT: ; implicit-def: $vgpr4
52355131; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
52365132; GFX12-NEXT: s_cbranch_execnz .LBB14_1
52375133; GFX12-NEXT: ; %bb.2:
52385134; GFX12-NEXT: s_mov_b32 exec_lo, s1
5239- ; GFX12-NEXT: s_mov_b32 s1, 0
5240- ; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start
5241- ; GFX12-NEXT: ; =>This Loop Header: Depth=1
5242- ; GFX12-NEXT: ; Child Loop BB14_4 Depth 2
5243- ; GFX12-NEXT: s_wait_loadcnt 0x0
5244- ; GFX12-NEXT: v_pk_add_f16 v7, v8, v5
5245- ; GFX12-NEXT: s_mov_b32 s2, exec_lo
5246- ; GFX12-NEXT: s_wait_storecnt 0x0
5247- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
5248- ; GFX12-NEXT: v_mov_b32_e32 v6, v7
5249- ; GFX12-NEXT: v_mov_b32_e32 v7, v8
5250- ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5251- ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
5252- ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5253- ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
5254- ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
5255- ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
5256- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5257- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5258- ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5259- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5260- ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5261- ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5262- ; GFX12-NEXT: s_wait_loadcnt 0x0
5263- ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
5264- ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5265- ; GFX12-NEXT: s_cbranch_execnz .LBB14_4
5266- ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5267- ; GFX12-NEXT: s_mov_b32 exec_lo, s2
52685135; GFX12-NEXT: s_wait_loadcnt 0x0
5269- ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
5270- ; GFX12-NEXT: v_mov_b32_e32 v8, v6
5136+ ; GFX12-NEXT: v_mov_b32_e32 v0, v5
52715137; GFX12-NEXT: global_inv scope:SCOPE_DEV
5272- ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
5273- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5274- ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
5275- ; GFX12-NEXT: s_cbranch_execnz .LBB14_3
5276- ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
5277- ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
5278- ; GFX12-NEXT: v_mov_b32_e32 v0, v6
52795138; GFX12-NEXT: s_setpc_b64 s[30:31]
52805139;
52815140; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
52825141; GFX940: ; %bb.0:
52835142; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5284- ; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
52855143; GFX940-NEXT: s_mov_b64 s[2:3], exec
5144+ ; GFX940-NEXT: buffer_wbl2 sc1
52865145; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
52875146; GFX940-NEXT: v_readfirstlane_b32 s4, v0
52885147; GFX940-NEXT: v_readfirstlane_b32 s5, v1
@@ -5293,48 +5152,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52935152; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
52945153; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
52955154; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5296- ; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
5155+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
5156+ ; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
5157+ ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
52975158; GFX940-NEXT: ; implicit-def: $vgpr4
52985159; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
52995160; GFX940-NEXT: s_cbranch_execnz .LBB14_1
53005161; GFX940-NEXT: ; %bb.2:
53015162; GFX940-NEXT: s_mov_b64 exec, s[2:3]
5302- ; GFX940-NEXT: s_mov_b64 s[2:3], 0
5303- ; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start
5304- ; GFX940-NEXT: ; =>This Loop Header: Depth=1
5305- ; GFX940-NEXT: ; Child Loop BB14_4 Depth 2
5306- ; GFX940-NEXT: s_waitcnt vmcnt(0)
5307- ; GFX940-NEXT: v_pk_add_f16 v8, v9, v5
5308- ; GFX940-NEXT: s_mov_b64 s[8:9], exec
5309- ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
5310- ; GFX940-NEXT: buffer_wbl2 sc1
5311- ; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5312- ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
5313- ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5314- ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
5315- ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
5316- ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
5317- ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5318- ; GFX940-NEXT: s_nop 0
5319- ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5320- ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5321- ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5322- ; GFX940-NEXT: s_waitcnt vmcnt(0)
5323- ; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
5324- ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5325- ; GFX940-NEXT: s_cbranch_execnz .LBB14_4
5326- ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5327- ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
53285163; GFX940-NEXT: s_waitcnt vmcnt(0)
5329- ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5330- ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5331- ; GFX940-NEXT: v_mov_b32_e32 v9, v6
5164+ ; GFX940-NEXT: v_mov_b32_e32 v0, v5
53325165; GFX940-NEXT: buffer_inv sc1
5333- ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
5334- ; GFX940-NEXT: s_cbranch_execnz .LBB14_3
5335- ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
5336- ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
5337- ; GFX940-NEXT: v_mov_b32_e32 v0, v6
53385166; GFX940-NEXT: s_setpc_b64 s[30:31]
53395167;
53405168; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
@@ -5468,7 +5296,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
54685296; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
54695297; GFX90A: ; %bb.0:
54705298; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5471- ; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
54725299; GFX90A-NEXT: s_mov_b64 s[6:7], exec
54735300; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
54745301; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5479,47 +5306,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
54795306; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
54805307; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
54815308; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5482- ; GFX90A-NEXT: s_nop 0
5483- ; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
5309+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5310+ ; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
5311+ ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
54845312; GFX90A-NEXT: ; implicit-def: $vgpr4
54855313; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
54865314; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
54875315; GFX90A-NEXT: ; %bb.2:
54885316; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
5489- ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
5490- ; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start
5491- ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
5492- ; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2
54935317; GFX90A-NEXT: s_waitcnt vmcnt(0)
5494- ; GFX90A-NEXT: v_pk_add_f16 v8, v9, v5
5495- ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
5496- ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
5497- ; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5498- ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
5499- ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
5500- ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
5501- ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
5502- ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
5503- ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5504- ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5505- ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5506- ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5507- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5508- ; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
5509- ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5510- ; GFX90A-NEXT: s_cbranch_execnz .LBB14_4
5511- ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5512- ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
5513- ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5514- ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5515- ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5516- ; GFX90A-NEXT: v_mov_b32_e32 v9, v6
5318+ ; GFX90A-NEXT: v_mov_b32_e32 v0, v5
55175319; GFX90A-NEXT: buffer_wbinvl1
5518- ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
5519- ; GFX90A-NEXT: s_cbranch_execnz .LBB14_3
5520- ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
5521- ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
5522- ; GFX90A-NEXT: v_mov_b32_e32 v0, v6
55235320; GFX90A-NEXT: s_setpc_b64 s[30:31]
55245321;
55255322; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
0 commit comments