@@ -862,160 +862,138 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
862862define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16 (ptr addrspace (1 ) %out , i16 zeroext %x.arg , i16 zeroext %y.arg , i16 zeroext %z.arg ) #0 {
863863; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
864864; VI-DENORM: ; %bb.0:
865- ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
866- ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
867- ; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
865+ ; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
868866; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
869867; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
868+ ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
870869; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
871- ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
870+ ; VI-DENORM-NEXT: s_lshr_b32 s5, s2, 16
871+ ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
872+ ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
873+ ; VI-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, v0
874+ ; VI-DENORM-NEXT: v_fma_f16 v3, |s2|, 2.0, v1
872875; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
873- ; VI-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, v0
874- ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
875- ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
876- ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
877- ; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
878- ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
879- ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
880- ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
881- ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
876+ ; VI-DENORM-NEXT: s_add_u32 s4, s0, 2
877+ ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
878+ ; VI-DENORM-NEXT: s_addc_u32 s5, s1, 0
879+ ; VI-DENORM-NEXT: flat_store_short v[0:1], v3
882880; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
883881; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
884882; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
885- ; VI-DENORM-NEXT: flat_store_short v[0:1], v3
883+ ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
886884; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
887885; VI-DENORM-NEXT: s_endpgm
888886;
889887; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
890888; VI-FLUSH: ; %bb.0:
891- ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
892- ; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
893- ; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
889+ ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
894890; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
895891; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
892+ ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
896893; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
897- ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
894+ ; VI-FLUSH-NEXT: s_lshr_b32 s5, s2, 16
895+ ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
896+ ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
897+ ; VI-FLUSH-NEXT: v_mad_f16 v2, |s2|, 2.0, v0
898+ ; VI-FLUSH-NEXT: v_mad_f16 v3, |s2|, 2.0, v1
898899; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
899- ; VI-FLUSH-NEXT: v_mad_f16 v2, |s6|, 2.0, v0
900- ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
901- ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
902- ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
903- ; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
904- ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
905- ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
906- ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
907- ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
900+ ; VI-FLUSH-NEXT: s_add_u32 s4, s0, 2
901+ ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
902+ ; VI-FLUSH-NEXT: s_addc_u32 s5, s1, 0
903+ ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
908904; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
909905; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
910906; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
911- ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
907+ ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
912908; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
913909; VI-FLUSH-NEXT: s_endpgm
914910;
915911; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
916912; GFX10-DENORM: ; %bb.0:
917- ; GFX10-DENORM-NEXT: s_clause 0x2
918- ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
919- ; GFX10-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
920- ; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
913+ ; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
921914; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
922915; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
923- ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0 , 16
924- ; GFX10-DENORM-NEXT: v_fma_f16 v2, |s4 |, 2.0, s1
925- ; GFX10-DENORM-NEXT: v_fma_f16 v1, |s4 |, 2.0, s0
926- ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3 ]
916+ ; GFX10-DENORM-NEXT: s_lshr_b32 s4, s2 , 16
917+ ; GFX10-DENORM-NEXT: v_fma_f16 v2, |s2 |, 2.0, s3
918+ ; GFX10-DENORM-NEXT: v_fma_f16 v1, |s2 |, 2.0, s4
919+ ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1 ]
927920; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
928- ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3 ] offset:2
921+ ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1 ] offset:2
929922; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
930923; GFX10-DENORM-NEXT: s_endpgm
931924;
932925; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
933926; GFX10-FLUSH: ; %bb.0:
934- ; GFX10-FLUSH-NEXT: s_clause 0x2
935- ; GFX10-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
936- ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
937- ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
927+ ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
938928; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
939929; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
940- ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s4 |, |s4 |
941- ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0 , 16
942- ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0 , v0
943- ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1 , v0
944- ; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[2:3 ]
930+ ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2 |, |s2 |
931+ ; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2 , 16
932+ ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2 , v0
933+ ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s3 , v0
934+ ; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1 ]
945935; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
946- ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[2:3 ] offset:2
936+ ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1 ] offset:2
947937; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
948938; GFX10-FLUSH-NEXT: s_endpgm
949939;
950940; GFX11-DENORM-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
951941; GFX11-DENORM-TRUE16: ; %bb.0:
952- ; GFX11-DENORM-TRUE16-NEXT: s_clause 0x2
953- ; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
954- ; GFX11-DENORM-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x8
955- ; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
942+ ; GFX11-DENORM-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
956943; GFX11-DENORM-TRUE16-NEXT: v_mov_b32_e32 v1, 0
957944; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
958- ; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s0, s0 , 16
959- ; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, |s6 |, 2.0, s1
960- ; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, |s6 |, 2.0, s0
961- ; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3 ] dlc
945+ ; GFX11-DENORM-TRUE16-NEXT: s_lshr_b32 s4, s2 , 16
946+ ; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.h, |s2 |, 2.0, s3
947+ ; GFX11-DENORM-TRUE16-NEXT: v_fma_f16 v0.l, |s2 |, 2.0, s4
948+ ; GFX11-DENORM-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1 ] dlc
962949; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
963- ; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3 ] offset:2 dlc
950+ ; GFX11-DENORM-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1 ] offset:2 dlc
964951; GFX11-DENORM-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
965952; GFX11-DENORM-TRUE16-NEXT: s_endpgm
966953;
967954; GFX11-DENORM-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
968955; GFX11-DENORM-FAKE16: ; %bb.0:
969- ; GFX11-DENORM-FAKE16-NEXT: s_clause 0x2
970- ; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
971- ; GFX11-DENORM-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x8
972- ; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
956+ ; GFX11-DENORM-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
973957; GFX11-DENORM-FAKE16-NEXT: v_mov_b32_e32 v0, 0
974958; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
975- ; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s0, s0 , 16
976- ; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, |s6 |, 2.0, s1
977- ; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, |s6 |, 2.0, s0
978- ; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3 ] dlc
959+ ; GFX11-DENORM-FAKE16-NEXT: s_lshr_b32 s4, s2 , 16
960+ ; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v2, |s2 |, 2.0, s3
961+ ; GFX11-DENORM-FAKE16-NEXT: v_fma_f16 v1, |s2 |, 2.0, s4
962+ ; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1 ] dlc
979963; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
980- ; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[2:3 ] offset:2 dlc
964+ ; GFX11-DENORM-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1 ] offset:2 dlc
981965; GFX11-DENORM-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
982966; GFX11-DENORM-FAKE16-NEXT: s_endpgm
983967;
984968; GFX11-FLUSH-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
985969; GFX11-FLUSH-TRUE16: ; %bb.0:
986- ; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
987- ; GFX11-FLUSH-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x8
988- ; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
989- ; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
970+ ; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
990971; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v1, 0
991972; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
992- ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, |s6 |, |s6 |
993- ; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s0, s0 , 16
973+ ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e64 v0.l, |s2 |, |s2 |
974+ ; GFX11-FLUSH-TRUE16-NEXT: s_lshr_b32 s2, s2 , 16
994975; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
995- ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s0 , v0.l
996- ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s1 , v0.l
997- ; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[2:3 ] dlc
976+ ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.h, s2 , v0.l
977+ ; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, s3 , v0.l
978+ ; GFX11-FLUSH-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1 ] dlc
998979; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
999- ; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3 ] offset:2 dlc
980+ ; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1 ] offset:2 dlc
1000981; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
1001982; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
1002983;
1003984; GFX11-FLUSH-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
1004985; GFX11-FLUSH-FAKE16: ; %bb.0:
1005- ; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x2
1006- ; GFX11-FLUSH-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x8
1007- ; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
1008- ; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
986+ ; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1009987; GFX11-FLUSH-FAKE16-NEXT: v_mov_b32_e32 v1, 0
1010988; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
1011- ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, |s6 |, |s6 |
1012- ; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s0, s0 , 16
989+ ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e64 v0, |s2 |, |s2 |
990+ ; GFX11-FLUSH-FAKE16-NEXT: s_lshr_b32 s2, s2 , 16
1013991; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1014- ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s0 , v0
1015- ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s1 , v0
1016- ; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[2:3 ] dlc
992+ ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v2, s2 , v0
993+ ; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, s3 , v0
994+ ; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v2, s[0:1 ] dlc
1017995; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1018- ; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[2:3 ] offset:2 dlc
996+ ; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1 ] offset:2 dlc
1019997; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
1020998; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
1021999 %x = bitcast i16 %x.arg to half
0 commit comments