diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7988a9ac0ce55..f48d1d8c011da 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -255,6 +255,12 @@ class FLAT_Store_Pseudo { + def "" : FLAT_Store_Pseudo; + let OtherPredicates = [HasTrue16BitInsts] in + def _t16 : FLAT_Store_Pseudo, True16D16Table; +} + multiclass FLAT_Global_Load_Pseudo { let is_flat_global = 1 in { def "" : FLAT_Load_Pseudo, @@ -264,6 +270,21 @@ multiclass FLAT_Global_Load_Pseudo { + defm "" : FLAT_Global_Load_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasTrue16BitInsts], + SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in { + def _t16 : FLAT_Load_Pseudo, + GlobalSaddrTable<0, Name16>, + True16D16Table; + def _SADDR_t16 : FLAT_Load_Pseudo, + GlobalSaddrTable<1, Name16>, + True16D16Table; + } +} + class FLAT_Global_Load_AddTid_Pseudo : FLAT_Pseudo< opName, @@ -300,6 +321,21 @@ multiclass FLAT_Global_Store_Pseudo { } } +multiclass FLAT_Global_Store_Pseudo_t16 { + defm "" : FLAT_Global_Store_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasTrue16BitInsts], + SubtargetPredicate = HasFlatGlobalInsts, is_flat_global = 1 in { + def _t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<0, Name16>, + True16D16Table; + def _SADDR_t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<1, Name16>, + True16D16Table; + } +} + class FLAT_Global_Load_LDS_Pseudo : FLAT_Pseudo< opName, (outs ), @@ -456,6 +492,29 @@ multiclass FLAT_Scratch_Load_Pseudo; } +multiclass FLAT_Scratch_Load_Pseudo_t16 { + defm "" : FLAT_Scratch_Load_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in { + def _t16 : FLAT_Scratch_Load_Pseudo, + FlatScratchInst, + True16D16Table; + def _SADDR_t16 : FLAT_Scratch_Load_Pseudo, + FlatScratchInst, + True16D16Table; + let SubtargetPredicate = HasFlatScratchSVSMode in + def _SVS_t16 : FLAT_Scratch_Load_Pseudo, + FlatScratchInst, + True16D16Table; + + let SubtargetPredicate = HasFlatScratchSTMode in + def _ST_t16 : FLAT_Scratch_Load_Pseudo, + FlatScratchInst, + True16D16Table; + } +} + multiclass FLAT_Scratch_Store_Pseudo { def "" : FLAT_Scratch_Store_Pseudo, FlatScratchInst; @@ -471,6 +530,31 @@ multiclass FLAT_Scratch_Store_Pseudo { FlatScratchInst; } +multiclass FLAT_Scratch_Store_Pseudo_t16 { + defm "" : FLAT_Scratch_Store_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasTrue16BitInsts], is_flat_scratch = 1 in { + def _t16 : FLAT_Scratch_Store_Pseudo, + FlatScratchInst, + True16D16Table; + def _SADDR_t16 : FLAT_Scratch_Store_Pseudo, + FlatScratchInst, + True16D16Table; + + let SubtargetPredicate = HasFlatScratchSVSMode in + def _SVS_t16 : FLAT_Scratch_Store_Pseudo, + FlatScratchInst, + True16D16Table; + + let SubtargetPredicate = HasFlatScratchSTMode in + def _ST_t16 : FLAT_Scratch_Store_Pseudo, + FlatScratchInst, + True16D16Table; + } +} + + class FLAT_Scratch_Load_LDS_Pseudo : FLAT_Pseudo< @@ -665,8 +749,6 @@ def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Pseudo <"flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Pseudo <"flat_store_short", VGPR_32>; def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; @@ -686,6 +768,9 @@ def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_ def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } +defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; + defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -834,19 +919,22 @@ defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; let TiedSourceNotRead = 1 in { -defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>; -defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; -defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; -defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_sbyte_d16">; +defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_short_d16">; +defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo_t16 <"global_load_ubyte_d16">; } +defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; +defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; + let OtherPredicates = [HasGFX10_BEncoding] in defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>; -defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; -defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo_t16 <"global_store_byte">; +defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo_t16 <"global_store_short">; defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; @@ -854,9 +942,6 @@ defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VR let OtherPredicates = [HasGFX10_BEncoding] in defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>; -defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; -defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; - defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -970,24 +1055,24 @@ defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", V defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>; let TiedSourceNotRead = 1 in { -defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32, 1>; defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>; -defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32, 1>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>; -defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32, 1>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_ubyte_d16">; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_sbyte_d16">; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo_t16 <"scratch_load_short_d16">; } -defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo <"scratch_store_byte", VGPR_32>; -defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo <"scratch_store_short", VGPR_32>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; + +defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_byte">; +defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo_t16 <"scratch_store_short">; defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>; defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>; defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>; defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>; -defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; -defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; - defm SCRATCH_LOAD_LDS_UBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ubyte">; defm SCRATCH_LOAD_LDS_SBYTE : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sbyte">; defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_ushort">; @@ -1071,11 +1156,21 @@ class FlatSignedLoadPat_D16 ; +class FlatSignedLoadPat_D16_t16 : GCNPat < + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), + (inst $vaddr, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16 : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), (inst $saddr, $voffset, $offset, 0, $in) >; +class GlobalLoadSaddrPat_D16_t16 : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class FlatLoadSignedPat : GCNPat < (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), (inst $vaddr, $offset) @@ -1208,6 +1303,11 @@ class ScratchLoadSignedPat_D16 ; +class ScratchLoadSignedPat_D16_t16 : GCNPat < + (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), + (inst $vaddr, $offset, 0) +>; + class ScratchStoreSignedPat : GCNPat < (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), (inst getVregSrcForVT.ret:$data, $vaddr, $offset) @@ -1223,6 +1323,11 @@ class ScratchLoadSaddrPat_D16 ; +class ScratchLoadSaddrPat_D16_t16 : GCNPat < + (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))), + (inst $saddr, $offset, 0) +>; + class ScratchStoreSaddrPat : GCNPat < (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)), @@ -1245,6 +1350,11 @@ class ScratchLoadSVaddrPat_D16 ; +class ScratchLoadSVaddrPat_D16_t16 : GCNPat < + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), + (inst $vaddr, $saddr, $offset, 0) +>; + multiclass GlobalFLATLoadPats { def : FlatLoadSignedPat { let AddedComplexity = 10; @@ -1265,6 +1375,16 @@ multiclass GlobalFLATLoadPats_D16 { + def : FlatSignedLoadPat_D16_t16(inst#"_t16"), node, vt> { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_D16_t16(inst#"_SADDR_t16"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATStorePats { def : FlatStoreSignedPat { @@ -1276,6 +1396,16 @@ multiclass GlobalFLATStorePats { + def : FlatStoreSignedPat(inst#"_t16"), node, vt> { + let AddedComplexity = 10; + } + + def : GlobalStoreSaddrPat(inst#"_SADDR_t16"), node, vt> { + let AddedComplexity = 11; + } +} + multiclass GlobalFLATAtomicPatsNoRtnBase { let AddedComplexity = 11 in @@ -1358,6 +1488,22 @@ multiclass ScratchFLATStorePats { + def : ScratchStoreSignedPat (inst#"_t16"), node, vt> { + let AddedComplexity = 25; + } + + def : ScratchStoreSaddrPat(inst#"_SADDR_t16"), node, vt> { + let AddedComplexity = 26; + } + + def : ScratchStoreSVaddrPat(inst#"_SVS_t16"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } +} + multiclass ScratchFLATLoadPats_D16 { def : ScratchLoadSignedPat_D16 { let AddedComplexity = 25; @@ -1373,6 +1519,21 @@ multiclass ScratchFLATLoadPats_D16 { + def : ScratchLoadSignedPat_D16_t16 (inst#"_t16"), node, vt> { + let AddedComplexity = 25; + } + + def : ScratchLoadSaddrPat_D16_t16(inst#"_SADDR_t16"), node, vt> { + let AddedComplexity = 26; + } + + def : ScratchLoadSVaddrPat_D16_t16 (inst#"_SVS_t16"), node, vt> { + let SubtargetPredicate = HasFlatScratchSVSMode; + let AddedComplexity = 27; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { def : FlatLoadPat ; @@ -1409,6 +1570,8 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi def : FlatLoadPat_D16_t16; def : FlatLoadPat_D16_t16; def : FlatLoadPat_D16_t16; + def : FlatStorePat ; + def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts def : FlatLoadPat ; @@ -1489,9 +1652,6 @@ let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; } -def : FlatStorePat ; -def : FlatStorePat ; - let OtherPredicates = [HasD16LoadStore] in { def : FlatStorePat ; def : FlatStorePat ; @@ -1531,15 +1691,28 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; + +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +} + +let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>; +defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>; +defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>; +} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts foreach vt = Reg32Types.types in { defm : GlobalFLATLoadPats ; @@ -1565,11 +1738,15 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATStorePats ; -defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; +} + let OtherPredicates = [HasD16LoadStore] in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; @@ -1715,13 +1892,28 @@ let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; -defm : ScratchFLATLoadPats ; -defm : ScratchFLATLoadPats ; -defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; + +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { +defm : ScratchFLATLoadPats ; +defm : ScratchFLATLoadPats ; +defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; +defm : ScratchFLATStorePats ; +defm : ScratchFLATStorePats ; +} + +let True16Predicate = UseRealTrue16Insts in { +defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", extloadi8_private, i16>; +defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_UBYTE_D16", zextloadi8_private, i16>; +defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SBYTE_D16", sextloadi8_private, i16>; +defm : ScratchFLATLoadPats_D16_t16<"SCRATCH_LOAD_SHORT_D16", load_private, i16>; +defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_SHORT", store_private, i16>; +defm : ScratchFLATStorePats_t16 <"SCRATCH_STORE_BYTE", truncstorei8_private, i16>; +} // End True16Predicate = UseRealTrue16Insts foreach vt = Reg32Types.types in { defm : ScratchFLATLoadPats ; @@ -1741,9 +1933,7 @@ defm : ScratchFLATStorePats ; } defm : ScratchFLATStorePats ; -defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; -defm : ScratchFLATStorePats ; defm : ScratchFLATStorePats ; let OtherPredicates = [HasD16LoadStore, HasFlatScratchInsts, EnableFlatScratch] in { diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 7bd4d616b2181..b6572231ea2e6 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -60,13 +60,21 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[2:3], v0, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_load_store: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_load_store: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void @@ -2127,14 +2135,23 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX10-NEXT: global_store_short v[2:3], v5, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_store_fpimm: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 -; GFX11-NEXT: global_store_b16 v[0:1], v4, off -; GFX11-NEXT: global_store_b16 v[2:3], v5, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_store_fpimm: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228 +; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v4, off +; GFX11TRUE16-NEXT: global_store_d16_hi_b16 v[2:3], v4, off +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_store_fpimm: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off +; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void @@ -3330,12 +3347,19 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_inreg_arg_store: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_inreg_arg_store: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 +; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_inreg_arg_store: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void } @@ -3379,11 +3403,18 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], s32 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_byval: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_b16 off, v0, s32 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_byval: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11TRUE16-NEXT: scratch_store_b16 off, v1, s32 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_byval: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] store bfloat %val, ptr addrspace(5) %bv %retval = load bfloat, ptr addrspace(5) %bv ret bfloat %retval @@ -3490,13 +3521,21 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou ; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_bitcast_from_bfloat: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[2:3], v0, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_bitcast_from_bfloat: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v0, off +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_bitcast_from_bfloat: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: global_load_u16 v0, v[0:1], off +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val_int = bitcast bfloat %val to i16 store i16 %val_int, ptr addrspace(1) %out @@ -3556,13 +3595,21 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_bitcast_to_bfloat: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[0:1], v2, off -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_bitcast_to_bfloat: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: global_load_d16_b16 v2, v[2:3], off +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_bitcast_to_bfloat: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: global_load_u16 v2, v[2:3], off +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to bfloat store bfloat %val_fp, ptr addrspace(1) %out @@ -5309,14 +5356,23 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_alloca_load_store_ret: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_b16 off, v0, s32 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u16 v0, off, s32 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_alloca_load_store_ret: +; GFX11TRUE16: ; %bb.0: ; %entry +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_alloca_load_store_ret: +; GFX11FAKE16: ; %bb.0: ; %entry +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc +; GFX11FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %in.addr = alloca bfloat, align 2, addrspace(5) store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 @@ -5667,26 +5723,48 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX10-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_overflow_stack: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 -; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 -; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 -; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: test_overflow_stack: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: s_clause 0x2 +; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11TRUE16-NEXT: s_clause 0x3 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11TRUE16-NEXT: s_clause 0x1 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: s_clause 0x2 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX11TRUE16-NEXT: scratch_store_b16 v0, v1, off offset:128 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: test_overflow_stack: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: s_clause 0x2 +; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11FAKE16-NEXT: s_clause 0x5 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: s_clause 0x2 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX11FAKE16-NEXT: scratch_store_b16 v0, v1, off offset:128 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 ret { <32 x i32>, bfloat } %ins.1 @@ -42719,7 +42797,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: s_clause 0x1f -; GFX11TRUE16-NEXT: scratch_load_u16 v31, off, s32 +; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:64 ; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128 ; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60 @@ -42752,9 +42830,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4 ; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 ; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 ; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 @@ -42762,6 +42840,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 ; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 ; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16 ; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 @@ -42785,7 +42864,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22 @@ -42815,7 +42893,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30) @@ -42823,6 +42901,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29) ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28) ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35 ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27 diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index f1ff511f53749..b6eca494fe690 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -48,15 +48,25 @@ define <2 x half> @chain_hi_to_lo_private() { ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_private: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, 2 -; GFX11-NEXT: scratch_load_u16 v0, off, s0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_private: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_private: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 2 +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, off, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, off, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1 %load_lo = load half, ptr addrspace(5) %gep_lo @@ -104,13 +114,21 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_private_different_bases: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_u16 v0, v0, off -; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_private_different_bases: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, v0, off +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_private_different_bases: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_u16 v0, v0, off +; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v0, v1, off +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, ptr addrspace(5) %base_lo %load_hi = load half, ptr addrspace(5) %base_hi @@ -288,17 +306,29 @@ define <2 x half> @chain_hi_to_lo_global() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_global: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_global: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_global: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[1:2], off +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1 %load_lo = load half, ptr addrspace(1) %gep_lo @@ -328,13 +358,21 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_global_different_bases: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_different_bases: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_different_bases: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[2:3], off +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, ptr addrspace(1) %base_lo %load_hi = load half, ptr addrspace(1) %base_hi @@ -592,25 +630,23 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:2 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:2 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_load_u16 v0, v2, s[0:1] offset:4 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[0:1] offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_u16 v3, off, off offset:2 -; GFX11-TRUE16-NEXT: scratch_load_u16 v0, off, off -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[2:3] @@ -865,17 +901,30 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) { ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_global_other_dep: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_global_other_dep: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v[0:1], off offset:2 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1 %load_lo = load volatile i16, ptr addrspace(1) %gep_lo diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 768c6c0ac7a29..e72f3d3ce993a 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -48,13 +48,22 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace( ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_undef_value_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_undef_value_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_undef_value_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half undef) store half %canonicalized, ptr addrspace(1) %out ret void @@ -102,7 +111,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l ; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off @@ -277,7 +286,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, |v0.l|, |v0.l| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -342,7 +351,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -408,7 +417,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -473,7 +482,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -538,7 +547,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -591,13 +600,22 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_p0_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_p0_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_p0_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -633,13 +651,22 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_n0_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_n0_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_n0_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -675,13 +702,22 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_p1_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_p1_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_p1_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -717,13 +753,22 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_n1_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_n1_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xbc00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_n1_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -759,13 +804,22 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) % ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_literal_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_literal_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4c00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_literal_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, ptr addrspace(1) %out ret void @@ -801,13 +855,22 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3ff +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -843,13 +906,22 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3ff +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal0_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -885,13 +957,22 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x83ff +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -927,13 +1008,22 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x83ff +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_denormals_fold_canonicalize_denormal1_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -969,13 +1059,22 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_qnan_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_qnan_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7c00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1011,13 +1110,22 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg1_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1053,13 +1161,22 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_qnan_value_neg2_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1095,13 +1212,22 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace( ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_snan0_value_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan0_value_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1137,13 +1263,22 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace( ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_snan1_value_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan1_value_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1179,13 +1314,22 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace( ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_snan2_value_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan2_value_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, ptr addrspace(1) %out ret void @@ -1221,13 +1365,22 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace( ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_fold_canonicalize_snan3_value_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7e00 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_fold_canonicalize_snan3_value_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index c39a03ee8008c..4cfdb968f7090 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -1,14 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck --check-prefixes=GFX12,GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-PAL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX942 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11-PAL %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX12-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck --check-prefixes=GFX11-PAL,GFX11-PAL-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck --check-prefixes=GFX11-PAL,GFX11-PAL-FAKE16 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck --check-prefixes=GFX12-PAL,GFX12-PAL-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck --check-prefixes=GFX12-PAL,GFX12-PAL-FAKE16 %s define amdgpu_kernel void @zero_init_kernel() { ; GFX9-LABEL: zero_init_kernel: @@ -4452,30 +4456,55 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: store_load_i32_negative_unaligned: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: store_load_i32_negative_unaligned: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: store_load_i32_negative_unaligned: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX11-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: store_load_i32_negative_unaligned: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: store_load_i32_negative_unaligned: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: store_load_i32_negative_unaligned: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb @@ -4520,30 +4549,55 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-PAL-LABEL: store_load_i32_negative_unaligned: -; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc -; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc -; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-PAL-LABEL: store_load_i32_negative_unaligned: -; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-PAL-NEXT: s_wait_expcnt 0x0 -; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 -; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-PAL-NEXT: s_wait_storecnt 0x0 -; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS -; GFX12-PAL-NEXT: s_wait_storecnt 0x0 -; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS -; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] +; GFX11-PAL-TRUE16-LABEL: store_load_i32_negative_unaligned: +; GFX11-PAL-TRUE16: ; %bb.0: ; %bb +; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-PAL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX11-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-PAL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 glc dlc +; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-PAL-FAKE16-LABEL: store_load_i32_negative_unaligned: +; GFX11-PAL-FAKE16: ; %bb.0: ; %bb +; GFX11-PAL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-PAL-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-PAL-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-PAL-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc +; GFX11-PAL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-TRUE16-LABEL: store_load_i32_negative_unaligned: +; GFX12-PAL-TRUE16: ; %bb.0: ; %bb +; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-FAKE16-LABEL: store_load_i32_negative_unaligned: +; GFX12-PAL-FAKE16: ; %bb.0: ; %bb +; GFX12-PAL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS +; GFX12-PAL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1 store volatile i8 1, ptr addrspace(5) %ptr, align 1 @@ -4574,30 +4628,56 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: store_load_i32_large_negative_unaligned: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 -; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: store_load_i32_large_negative_unaligned: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffff000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-129 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 +; GFX11-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX12-TRUE16: ; %bb.0: ; %bb +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX12-FAKE16: ; %bb.0: ; %bb +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: ; GFX9-PAL: ; %bb.0: ; %bb @@ -4643,30 +4723,56 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned: -; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc -; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc -; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned: -; GFX12-PAL: ; %bb.0: ; %bb -; GFX12-PAL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-PAL-NEXT: s_wait_expcnt 0x0 -; GFX12-PAL-NEXT: s_wait_samplecnt 0x0 -; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX12-PAL-NEXT: s_wait_storecnt 0x0 -; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS -; GFX12-PAL-NEXT: s_wait_storecnt 0x0 -; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS -; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: s_setpc_b64 s[30:31] +; GFX11-PAL-TRUE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX11-PAL-TRUE16: ; %bb.0: ; %bb +; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-PAL-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffff000, v0 +; GFX11-PAL-TRUE16-NEXT: v_mov_b16_e32 v0.l, 1 +; GFX11-PAL-TRUE16-NEXT: scratch_store_b8 v1, v0, off offset:-129 dlc +; GFX11-PAL-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v1, off offset:-129 glc dlc +; GFX11-PAL-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-PAL-FAKE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX11-PAL-FAKE16: ; %bb.0: ; %bb +; GFX11-PAL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-PAL-FAKE16-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 +; GFX11-PAL-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc +; GFX11-PAL-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc +; GFX11-PAL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-PAL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-TRUE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX12-PAL-TRUE16: ; %bb.0: ; %bb +; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-TRUE16-NEXT: scratch_load_d16_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-PAL-FAKE16-LABEL: store_load_i32_large_negative_unaligned: +; GFX12-PAL-FAKE16: ; %bb.0: ; %bb +; GFX12-PAL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-FAKE16-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-FAKE16-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS +; GFX12-PAL-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-PAL-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225 store volatile i8 1, ptr addrspace(5) %ptr, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index f490ecf68d984..f72b01b8b6426 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7568,7 +7568,7 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0 @@ -7816,16 +7816,13 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v2, s[2:3] glc dlc +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v1, v2, s[4:5] glc dlc +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] glc dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v3, v2, s[6:7] glc dlc +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7] glc dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll index 790a457c2b337..77575c78fb349 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll @@ -13,7 +13,7 @@ ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_eq(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -33,7 +33,7 @@ entry: ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ne(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -53,7 +53,7 @@ entry: ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ugt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -73,7 +73,7 @@ entry: ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_uge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -93,7 +93,7 @@ entry: ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ult(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -113,7 +113,7 @@ entry: ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ule(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,7 +134,7 @@ entry: ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sgt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -154,7 +154,7 @@ entry: ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sge(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -174,7 +174,7 @@ entry: ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_slt(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -194,7 +194,7 @@ entry: ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.l, v{{[0-9]+}}.h +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sle(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -215,7 +215,7 @@ entry: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -233,7 +233,7 @@ entry: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -251,7 +251,7 @@ entry: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -269,7 +269,7 @@ entry: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -287,7 +287,7 @@ entry: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -305,7 +305,7 @@ entry: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -323,7 +323,7 @@ entry: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -341,7 +341,7 @@ entry: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -359,7 +359,7 @@ entry: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -377,7 +377,7 @@ entry: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.h, v{{[0-9]+}}.l +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index c4c0dc6998265..64668f006aab4 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -71,21 +71,16 @@ define amdgpu_kernel void @mad_u16( ; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v1, s[6:7] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: mad_u16: diff --git a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll index 7e867a5372986..a9d760d19ec04 100644 --- a/llvm/test/CodeGen/AMDGPU/minimummaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimummaximum.ll @@ -158,16 +158,28 @@ define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) { } define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) { -; SDAG-LABEL: s_test_minmax_f16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_maximum_f16 s0, s0, s1 -; SDAG-NEXT: s_mov_b32 s5, s4 -; SDAG-NEXT: s_mov_b32 s4, s3 -; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; SDAG-NEXT: s_minimum_f16 s0, s0, s2 -; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; SDAG-NEXT: global_store_b16 v0, v1, s[4:5] -; SDAG-NEXT: s_endpgm +; SDAG-TRUE16-LABEL: s_test_minmax_f16: +; SDAG-TRUE16: ; %bb.0: +; SDAG-TRUE16-NEXT: s_maximum_f16 s0, s0, s1 +; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-TRUE16-NEXT: s_mov_b32 s5, s4 +; SDAG-TRUE16-NEXT: s_mov_b32 s4, s3 +; SDAG-TRUE16-NEXT: s_minimum_f16 s0, s0, s2 +; SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-TRUE16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: s_test_minmax_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_maximum_f16 s0, s0, s1 +; SDAG-FAKE16-NEXT: s_mov_b32 s5, s4 +; SDAG-FAKE16-NEXT: s_mov_b32 s4, s3 +; SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; SDAG-FAKE16-NEXT: s_minimum_f16 s0, s0, s2 +; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[4:5] +; SDAG-FAKE16-NEXT: s_endpgm ; ; GISEL-LABEL: s_test_minmax_f16: ; GISEL: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 53f1c476e49ee..0acb4a49dcb61 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1340,7 +1340,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 ; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -1527,7 +1527,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1761,21 +1761,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v0, v1, s[2:3] glc dlc +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_u16 v2, v1, s[2:3] glc dlc +; GFX11-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.h, v0.h, 64 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] dlc -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] dlc +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; ; GFX11-SDAG-FAKE16-LABEL: v_test_i16_x_sub_64_multi_use: diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 072151dd6f5a0..5ecfad9160903 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -88,15 +88,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -220,15 +217,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -466,15 +460,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -600,15 +591,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: global_load_d16_hi_b16 v0, v1, s[2:3] glc dlc ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)