@@ -799,6 +799,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
799799; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand:
800800; GFX90A: ; %bb.0:
801801; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
802+ ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
803+ ; GFX90A-NEXT: s_mov_b32 s6, 32
804+ ; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
805+ ; GFX90A-NEXT: s_getpc_b64 s[6:7]
806+ ; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
807+ ; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
808+ ; GFX90A-NEXT: s_cmp_eq_u32 s7, s4
809+ ; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
810+ ; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
811+ ; GFX90A-NEXT: s_mov_b64 s[4:5], -1
812+ ; GFX90A-NEXT: s_mov_b32 s6, 1
813+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
814+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
815+ ; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
816+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3
817+ ; GFX90A-NEXT: .LBB5_1: ; %Flow4
818+ ; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
819+ ; GFX90A-NEXT: s_mov_b32 s4, 1
820+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4
821+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
822+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10
823+ ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared
824+ ; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
825+ ; GFX90A-NEXT: s_branch .LBB5_10
826+ ; GFX90A-NEXT: .LBB5_3: ; %atomicrmw.check.private
802827; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base
803828; GFX90A-NEXT: s_mov_b32 s6, 32
804829; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
@@ -813,50 +838,54 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
813838; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
814839; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
815840; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
816- ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2
817- ; GFX90A-NEXT: s_branch .LBB5_3
818- ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private
841+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5
842+ ; GFX90A-NEXT: s_branch .LBB5_6
843+ ; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.private
819844; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
820845; GFX90A-NEXT: s_waitcnt vmcnt(0)
821846; GFX90A-NEXT: v_mov_b32_e32 v3, v2
822- ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
823- ; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
824- ; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
825- ; GFX90A-NEXT: s_branch .LBB5_6
826- ; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global
847+ ; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
848+ ; GFX90A-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
849+ ; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen
850+ ; GFX90A-NEXT: s_branch .LBB5_9
851+ ; GFX90A-NEXT: .LBB5_5: ; %atomicrmw.global
852+ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
827853; GFX90A-NEXT: s_getpc_b64 s[4:5]
828854; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4
829855; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12
830- ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
831- ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
856+ ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v2, s[4:5]
832857; GFX90A-NEXT: s_mov_b64 s[4:5], 0
833- ; GFX90A-NEXT: s_branch .LBB5_4
834- ; GFX90A-NEXT: .LBB5_3: ; %Flow
858+ ; GFX90A-NEXT: s_branch .LBB5_7
859+ ; GFX90A-NEXT: .LBB5_6: ; %Flow
835860; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
836- ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1
837- ; GFX90A-NEXT: s_branch .LBB5_6
838- ; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start
861+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB5_4
862+ ; GFX90A-NEXT: s_branch .LBB5_9
863+ ; GFX90A-NEXT: .LBB5_7: ; %atomicrmw.start
839864; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
840- ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
865+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
841866; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
842867; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
843868; GFX90A-NEXT: s_getpc_b64 s[6:7]
844869; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
845870; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
846- ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
847- ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc
848- ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
871+ ; GFX90A-NEXT: v_mov_b32_e32 v6, 0
872+ ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[6:7] glc
873+ ; GFX90A-NEXT: s_waitcnt vmcnt(0)
849874; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5]
850875; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
851876; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
852- ; GFX90A-NEXT: s_cbranch_execnz .LBB5_4
853- ; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1
877+ ; GFX90A-NEXT: s_cbranch_execnz .LBB5_7
878+ ; GFX90A-NEXT: ; %bb.8: ; %atomicrmw.end1
854879; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
855880; GFX90A-NEXT: s_mov_b64 s[4:5], 0
856- ; GFX90A-NEXT: s_branch .LBB5_3
857- ; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi
858- ; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end
881+ ; GFX90A-NEXT: s_branch .LBB5_6
882+ ; GFX90A-NEXT: .LBB5_9: ; %Flow3
883+ ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
884+ ; GFX90A-NEXT: s_branch .LBB5_1
885+ ; GFX90A-NEXT: .LBB5_10: ; %atomicrmw.phi
886+ ; GFX90A-NEXT: ; %bb.11: ; %atomicrmw.end
859887; GFX90A-NEXT: s_mov_b32 s4, 32
888+ ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
860889; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3]
861890; GFX90A-NEXT: v_mov_b32_e32 v0, v2
862891; GFX90A-NEXT: v_mov_b32_e32 v1, v4
@@ -866,6 +895,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
866895; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand:
867896; GFX942: ; %bb.0:
868897; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
898+ ; GFX942-NEXT: s_mov_b64 s[0:1], src_shared_base
899+ ; GFX942-NEXT: s_mov_b32 s2, 32
900+ ; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
901+ ; GFX942-NEXT: s_getpc_b64 s[2:3]
902+ ; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
903+ ; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
904+ ; GFX942-NEXT: s_cmp_eq_u32 s3, s0
905+ ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
906+ ; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
907+ ; GFX942-NEXT: s_mov_b64 s[0:1], -1
908+ ; GFX942-NEXT: s_mov_b32 s2, 1
909+ ; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
910+ ; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
911+ ; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
912+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_3
913+ ; GFX942-NEXT: .LBB5_1: ; %Flow4
914+ ; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
915+ ; GFX942-NEXT: s_mov_b32 s0, 1
916+ ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0
917+ ; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
918+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_10
919+ ; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared
920+ ; GFX942-NEXT: ds_add_rtn_f64 v[2:3], v0, v[0:1]
921+ ; GFX942-NEXT: s_branch .LBB5_10
922+ ; GFX942-NEXT: .LBB5_3: ; %atomicrmw.check.private
869923; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base
870924; GFX942-NEXT: s_mov_b32 s2, 32
871925; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
@@ -880,48 +934,52 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
880934; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
881935; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
882936; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
883- ; GFX942-NEXT: s_cbranch_vccnz .LBB5_2
884- ; GFX942-NEXT: s_branch .LBB5_3
885- ; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private
937+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_5
938+ ; GFX942-NEXT: s_branch .LBB5_6
939+ ; GFX942-NEXT: .LBB5_4: ; %atomicrmw.private
886940; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0
887941; GFX942-NEXT: s_waitcnt vmcnt(0)
888- ; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
889- ; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0
890- ; GFX942-NEXT: s_branch .LBB5_6
891- ; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global
942+ ; GFX942-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
943+ ; GFX942-NEXT: scratch_store_dwordx2 off, v[4:5], s0
944+ ; GFX942-NEXT: s_branch .LBB5_9
945+ ; GFX942-NEXT: .LBB5_5: ; %atomicrmw.global
946+ ; GFX942-NEXT: v_mov_b32_e32 v2, 0
892947; GFX942-NEXT: s_getpc_b64 s[0:1]
893948; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4
894949; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12
895- ; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
896- ; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
950+ ; GFX942-NEXT: global_load_dwordx2 v[2:3], v2, s[0:1]
897951; GFX942-NEXT: s_mov_b64 s[0:1], 0
898- ; GFX942-NEXT: s_branch .LBB5_4
899- ; GFX942-NEXT: .LBB5_3: ; %Flow
952+ ; GFX942-NEXT: s_branch .LBB5_7
953+ ; GFX942-NEXT: .LBB5_6: ; %Flow
900954; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
901- ; GFX942-NEXT: s_cbranch_vccnz .LBB5_1
902- ; GFX942-NEXT: s_branch .LBB5_6
903- ; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start
955+ ; GFX942-NEXT: s_cbranch_vccnz .LBB5_4
956+ ; GFX942-NEXT: s_branch .LBB5_9
957+ ; GFX942-NEXT: .LBB5_7: ; %atomicrmw.start
904958; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
905- ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959+ ; GFX942-NEXT: s_waitcnt vmcnt(0)
906960; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
907961; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
908962; GFX942-NEXT: s_getpc_b64 s[2:3]
909963; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
910964; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
911- ; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
912- ; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1
913- ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
965+ ; GFX942-NEXT: v_mov_b32_e32 v6, 0
966+ ; GFX942-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[2:3] sc0 sc1
967+ ; GFX942-NEXT: s_waitcnt vmcnt(0)
914968; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5]
915969; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
916970; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1]
917- ; GFX942-NEXT: s_cbranch_execnz .LBB5_4
918- ; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1
971+ ; GFX942-NEXT: s_cbranch_execnz .LBB5_7
972+ ; GFX942-NEXT: ; %bb.8: ; %atomicrmw.end1
919973; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
920974; GFX942-NEXT: s_mov_b64 s[0:1], 0
921- ; GFX942-NEXT: s_branch .LBB5_3
922- ; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi
923- ; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end
975+ ; GFX942-NEXT: s_branch .LBB5_6
976+ ; GFX942-NEXT: .LBB5_9: ; %Flow3
977+ ; GFX942-NEXT: s_mov_b64 s[0:1], 0
978+ ; GFX942-NEXT: s_branch .LBB5_1
979+ ; GFX942-NEXT: .LBB5_10: ; %atomicrmw.phi
980+ ; GFX942-NEXT: ; %bb.11: ; %atomicrmw.end
924981; GFX942-NEXT: s_mov_b32 s0, 32
982+ ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
925983; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3]
926984; GFX942-NEXT: v_mov_b32_e32 v0, v2
927985; GFX942-NEXT: v_mov_b32_e32 v1, v4
0 commit comments