@@ -799,6 +799,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
799799; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand: 
800800; GFX90A:       ; %bb.0: 
801801; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
802+ ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base 
803+ ; GFX90A-NEXT:    s_mov_b32 s6, 32 
804+ ; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6 
805+ ; GFX90A-NEXT:    s_getpc_b64 s[6:7] 
806+ ; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4 
807+ ; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12 
808+ ; GFX90A-NEXT:    s_cmp_eq_u32 s7, s4 
809+ ; GFX90A-NEXT:    s_cselect_b64 s[4:5], -1, 0 
810+ ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5] 
811+ ; GFX90A-NEXT:    s_mov_b64 s[4:5], -1 
812+ ; GFX90A-NEXT:    s_mov_b32 s6, 1 
813+ ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[6:7], v2, s6 
814+ ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[6:7] 
815+ ; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3 
816+ ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_3 
817+ ; GFX90A-NEXT:  .LBB5_1: ; %Flow4 
818+ ; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5] 
819+ ; GFX90A-NEXT:    s_mov_b32 s4, 1 
820+ ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[4:5], v4, s4 
821+ ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[4:5] 
822+ ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_10 
823+ ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.shared 
824+ ; GFX90A-NEXT:    ds_add_rtn_f64 v[2:3], v0, v[0:1] 
825+ ; GFX90A-NEXT:    s_branch .LBB5_10 
826+ ; GFX90A-NEXT:  .LBB5_3: ; %atomicrmw.check.private 
802827; GFX90A-NEXT:    s_mov_b64 s[4:5], src_private_base 
803828; GFX90A-NEXT:    s_mov_b32 s6, 32 
804829; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6 
@@ -813,50 +838,54 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
813838; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[6:7], v2, s6 
814839; GFX90A-NEXT:    s_and_b64 vcc, exec, s[6:7] 
815840; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3 
816- ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_2  
817- ; GFX90A-NEXT:    s_branch .LBB5_3  
818- ; GFX90A-NEXT:  .LBB5_1 : ; %atomicrmw.private 
841+ ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_5  
842+ ; GFX90A-NEXT:    s_branch .LBB5_6  
843+ ; GFX90A-NEXT:  .LBB5_4 : ; %atomicrmw.private 
819844; GFX90A-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen 
820845; GFX90A-NEXT:    s_waitcnt vmcnt(0) 
821846; GFX90A-NEXT:    v_mov_b32_e32 v3, v2 
822- ; GFX90A-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1] 
823- ; GFX90A-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen 
824- ; GFX90A-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen 
825- ; GFX90A-NEXT:    s_branch .LBB5_6 
826- ; GFX90A-NEXT:  .LBB5_2: ; %atomicrmw.global 
847+ ; GFX90A-NEXT:    v_add_f64 v[4:5], v[2:3], v[0:1] 
848+ ; GFX90A-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen 
849+ ; GFX90A-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen 
850+ ; GFX90A-NEXT:    s_branch .LBB5_9 
851+ ; GFX90A-NEXT:  .LBB5_5: ; %atomicrmw.global 
852+ ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0 
827853; GFX90A-NEXT:    s_getpc_b64 s[4:5] 
828854; GFX90A-NEXT:    s_add_u32 s4, s4, global@rel32@lo+4 
829855; GFX90A-NEXT:    s_addc_u32 s5, s5, global@rel32@hi+12 
830- ; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] 
831- ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] 
856+ ; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v2, s[4:5] 
832857; GFX90A-NEXT:    s_mov_b64 s[4:5], 0 
833- ; GFX90A-NEXT:    s_branch .LBB5_4  
834- ; GFX90A-NEXT:  .LBB5_3 : ; %Flow 
858+ ; GFX90A-NEXT:    s_branch .LBB5_7  
859+ ; GFX90A-NEXT:  .LBB5_6 : ; %Flow 
835860; GFX90A-NEXT:    s_and_b64 vcc, exec, s[4:5] 
836- ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_1  
837- ; GFX90A-NEXT:    s_branch .LBB5_6  
838- ; GFX90A-NEXT:  .LBB5_4 : ; %atomicrmw.start 
861+ ; GFX90A-NEXT:    s_cbranch_vccnz .LBB5_4  
862+ ; GFX90A-NEXT:    s_branch .LBB5_9  
863+ ; GFX90A-NEXT:  .LBB5_7 : ; %atomicrmw.start 
839864; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1 
840- ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)  
865+ ; GFX90A-NEXT:    s_waitcnt vmcnt(0) 
841866; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] 
842867; GFX90A-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1] 
843868; GFX90A-NEXT:    s_getpc_b64 s[6:7] 
844869; GFX90A-NEXT:    s_add_u32 s6, s6, global@rel32@lo+4 
845870; GFX90A-NEXT:    s_addc_u32 s7, s7, global@rel32@hi+12 
846- ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]  
847- ; GFX90A-NEXT:    flat_atomic_cmpswap_x2  v[2:3], v[6:7 ], v[2:5 ] glc 
848- ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)  
871+ ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0  
872+ ; GFX90A-NEXT:    global_atomic_cmpswap_x2  v[2:3], v6, v[2:5 ], s[6:7 ] glc 
873+ ; GFX90A-NEXT:    s_waitcnt vmcnt(0) 
849874; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] 
850875; GFX90A-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5] 
851876; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5] 
852- ; GFX90A-NEXT:    s_cbranch_execnz .LBB5_4  
853- ; GFX90A-NEXT:  ; %bb.5 : ; %atomicrmw.end1 
877+ ; GFX90A-NEXT:    s_cbranch_execnz .LBB5_7  
878+ ; GFX90A-NEXT:  ; %bb.8 : ; %atomicrmw.end1 
854879; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5] 
855880; GFX90A-NEXT:    s_mov_b64 s[4:5], 0 
856- ; GFX90A-NEXT:    s_branch .LBB5_3 
857- ; GFX90A-NEXT:  .LBB5_6: ; %atomicrmw.phi 
858- ; GFX90A-NEXT:  ; %bb.7: ; %atomicrmw.end 
881+ ; GFX90A-NEXT:    s_branch .LBB5_6 
882+ ; GFX90A-NEXT:  .LBB5_9: ; %Flow3 
883+ ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0 
884+ ; GFX90A-NEXT:    s_branch .LBB5_1 
885+ ; GFX90A-NEXT:  .LBB5_10: ; %atomicrmw.phi 
886+ ; GFX90A-NEXT:  ; %bb.11: ; %atomicrmw.end 
859887; GFX90A-NEXT:    s_mov_b32 s4, 32 
888+ ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0) 
860889; GFX90A-NEXT:    v_lshrrev_b64 v[4:5], s4, v[2:3] 
861890; GFX90A-NEXT:    v_mov_b32_e32 v0, v2 
862891; GFX90A-NEXT:    v_mov_b32_e32 v1, v4 
@@ -866,6 +895,31 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
866895; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand: 
867896; GFX942:       ; %bb.0: 
868897; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
898+ ; GFX942-NEXT:    s_mov_b64 s[0:1], src_shared_base 
899+ ; GFX942-NEXT:    s_mov_b32 s2, 32 
900+ ; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2 
901+ ; GFX942-NEXT:    s_getpc_b64 s[2:3] 
902+ ; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4 
903+ ; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12 
904+ ; GFX942-NEXT:    s_cmp_eq_u32 s3, s0 
905+ ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0 
906+ ; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1] 
907+ ; GFX942-NEXT:    s_mov_b64 s[0:1], -1 
908+ ; GFX942-NEXT:    s_mov_b32 s2, 1 
909+ ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[2:3], v2, s2 
910+ ; GFX942-NEXT:    s_and_b64 vcc, exec, s[2:3] 
911+ ; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3 
912+ ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_3 
913+ ; GFX942-NEXT:  .LBB5_1: ; %Flow4 
914+ ; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1] 
915+ ; GFX942-NEXT:    s_mov_b32 s0, 1 
916+ ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], v4, s0 
917+ ; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1] 
918+ ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_10 
919+ ; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.shared 
920+ ; GFX942-NEXT:    ds_add_rtn_f64 v[2:3], v0, v[0:1] 
921+ ; GFX942-NEXT:    s_branch .LBB5_10 
922+ ; GFX942-NEXT:  .LBB5_3: ; %atomicrmw.check.private 
869923; GFX942-NEXT:    s_mov_b64 s[0:1], src_private_base 
870924; GFX942-NEXT:    s_mov_b32 s2, 32 
871925; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2 
@@ -880,48 +934,52 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
880934; GFX942-NEXT:    v_cmp_ne_u32_e64 s[2:3], v2, s2 
881935; GFX942-NEXT:    s_and_b64 vcc, exec, s[2:3] 
882936; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3 
883- ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_2  
884- ; GFX942-NEXT:    s_branch .LBB5_3  
885- ; GFX942-NEXT:  .LBB5_1 : ; %atomicrmw.private 
937+ ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_5  
938+ ; GFX942-NEXT:    s_branch .LBB5_6  
939+ ; GFX942-NEXT:  .LBB5_4 : ; %atomicrmw.private 
886940; GFX942-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 
887941; GFX942-NEXT:    s_waitcnt vmcnt(0) 
888- ; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1] 
889- ; GFX942-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 
890- ; GFX942-NEXT:    s_branch .LBB5_6 
891- ; GFX942-NEXT:  .LBB5_2: ; %atomicrmw.global 
942+ ; GFX942-NEXT:    v_add_f64 v[4:5], v[2:3], v[0:1] 
943+ ; GFX942-NEXT:    scratch_store_dwordx2 off, v[4:5], s0 
944+ ; GFX942-NEXT:    s_branch .LBB5_9 
945+ ; GFX942-NEXT:  .LBB5_5: ; %atomicrmw.global 
946+ ; GFX942-NEXT:    v_mov_b32_e32 v2, 0 
892947; GFX942-NEXT:    s_getpc_b64 s[0:1] 
893948; GFX942-NEXT:    s_add_u32 s0, s0, global@rel32@lo+4 
894949; GFX942-NEXT:    s_addc_u32 s1, s1, global@rel32@hi+12 
895- ; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1] 
896- ; GFX942-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] 
950+ ; GFX942-NEXT:    global_load_dwordx2 v[2:3], v2, s[0:1] 
897951; GFX942-NEXT:    s_mov_b64 s[0:1], 0 
898- ; GFX942-NEXT:    s_branch .LBB5_4  
899- ; GFX942-NEXT:  .LBB5_3 : ; %Flow 
952+ ; GFX942-NEXT:    s_branch .LBB5_7  
953+ ; GFX942-NEXT:  .LBB5_6 : ; %Flow 
900954; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1] 
901- ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_1  
902- ; GFX942-NEXT:    s_branch .LBB5_6  
903- ; GFX942-NEXT:  .LBB5_4 : ; %atomicrmw.start 
955+ ; GFX942-NEXT:    s_cbranch_vccnz .LBB5_4  
956+ ; GFX942-NEXT:    s_branch .LBB5_9  
957+ ; GFX942-NEXT:  .LBB5_7 : ; %atomicrmw.start 
904958; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1 
905- ; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)  
959+ ; GFX942-NEXT:    s_waitcnt vmcnt(0) 
906960; GFX942-NEXT:    v_mov_b64_e32 v[4:5], v[2:3] 
907961; GFX942-NEXT:    v_add_f64 v[2:3], v[4:5], v[0:1] 
908962; GFX942-NEXT:    s_getpc_b64 s[2:3] 
909963; GFX942-NEXT:    s_add_u32 s2, s2, global@rel32@lo+4 
910964; GFX942-NEXT:    s_addc_u32 s3, s3, global@rel32@hi+12 
911- ; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]  
912- ; GFX942-NEXT:    flat_atomic_cmpswap_x2  v[2:3], v[6:7 ], v [2:5 ] sc0 sc1 
913- ; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)  
965+ ; GFX942-NEXT:    v_mov_b32_e32 v6, 0  
966+ ; GFX942-NEXT:    global_atomic_cmpswap_x2  v[2:3], v6, v[2:5 ], s [2:3 ] sc0 sc1 
967+ ; GFX942-NEXT:    s_waitcnt vmcnt(0) 
914968; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] 
915969; GFX942-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1] 
916970; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1] 
917- ; GFX942-NEXT:    s_cbranch_execnz .LBB5_4  
918- ; GFX942-NEXT:  ; %bb.5 : ; %atomicrmw.end1 
971+ ; GFX942-NEXT:    s_cbranch_execnz .LBB5_7  
972+ ; GFX942-NEXT:  ; %bb.8 : ; %atomicrmw.end1 
919973; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1] 
920974; GFX942-NEXT:    s_mov_b64 s[0:1], 0 
921- ; GFX942-NEXT:    s_branch .LBB5_3 
922- ; GFX942-NEXT:  .LBB5_6: ; %atomicrmw.phi 
923- ; GFX942-NEXT:  ; %bb.7: ; %atomicrmw.end 
975+ ; GFX942-NEXT:    s_branch .LBB5_6 
976+ ; GFX942-NEXT:  .LBB5_9: ; %Flow3 
977+ ; GFX942-NEXT:    s_mov_b64 s[0:1], 0 
978+ ; GFX942-NEXT:    s_branch .LBB5_1 
979+ ; GFX942-NEXT:  .LBB5_10: ; %atomicrmw.phi 
980+ ; GFX942-NEXT:  ; %bb.11: ; %atomicrmw.end 
924981; GFX942-NEXT:    s_mov_b32 s0, 32 
982+ ; GFX942-NEXT:    s_waitcnt lgkmcnt(0) 
925983; GFX942-NEXT:    v_lshrrev_b64 v[4:5], s0, v[2:3] 
926984; GFX942-NEXT:    v_mov_b32_e32 v0, v2 
927985; GFX942-NEXT:    v_mov_b32_e32 v1, v4 
0 commit comments