diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index d3487daee364f..e1e7433b04697 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc { } } +multiclass DS_1A1D_NORET_t16 +: DS_1A1D_NORET_mc { + let has_m0_read = 0 in { + let True16Predicate = UseRealTrue16Insts in { + def "_t16" : DS_1A1D_NORET, True16D16Table; + } + } +} + multiclass DS_1A1D_NORET_mc_gfx9 { let has_m0_read = 0 in { def "" : DS_1A1D_NORET; @@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc +: DS_1A_RET_mc { + let has_m0_read = 0 in { + let True16Predicate = UseRealTrue16Insts in { + def "_t16" : DS_1A_RET, True16D16Table; + } + } +} + multiclass DS_1A_RET_NoM0 { let has_m0_read = 0 in { def "" : DS_1A_RET; @@ -457,8 +475,6 @@ defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">; defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">; let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">; -defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">; defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">; defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">; defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">; @@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">; } // End has_m0_read = 0 +defm DS_WRITE_B8 : DS_1A1D_NORET_t16<"ds_write_b8">; +defm DS_WRITE_B16 : DS_1A1D_NORET_t16<"ds_write_b16">; + let SubtargetPredicate = HasDSAddTid in { def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">; } @@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>; } let mayStore = 0 in { -defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">; -defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">; defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">; -defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">; defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">; defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>; @@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; } } // End has_m0_read = 0 +defm DS_READ_I8 : DS_1A_RET_t16<"ds_read_i8">; +defm DS_READ_U8 : DS_1A_RET_t16<"ds_read_u8">; +defm DS_READ_U16 : DS_1A_RET_t16<"ds_read_u16">; + let SubtargetPredicate = HasDSAddTid in { def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">; } @@ -784,34 +804,51 @@ multiclass DSReadPat_mc { } } +multiclass DSReadPat_t16 { + + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSReadPat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in + let True16Predicate = p in { + def : DSReadPat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } + let True16Predicate = UseRealTrue16Insts in { + def : DSReadPat(!cast(inst)#"_t16"), vt, !cast(frag)>; + } + } +} + class DSReadPat_D16 : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), (inst $ptr, Offset:$offset, (i1 0), $in) >; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; -defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; +defm : DSReadPat_t16 ; foreach vt = Reg32Types.types in { defm : DSReadPat_mc ; } -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; -defm : DSReadPat_mc ; +defm : DSReadPat_t16 ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; defm : DSReadPat_mc ; @@ -850,18 +887,34 @@ multiclass DSWritePat_mc { } } +multiclass DSWritePat_t16 { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSWritePat(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in + let True16Predicate = p in { + def : DSWritePat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } + let True16Predicate = UseRealTrue16Insts in { + def : DSWritePat(!cast(inst)#"_t16"), vt, !cast(frag)>; + } + } +} + defm : DSWritePat_mc ; defm : DSWritePat_mc ; -defm : DSWritePat_mc ; -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; +defm : DSWritePat_t16 ; foreach vt = Reg32Types.types in { defm : DSWritePat_mc ; } -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; defm : DSWritePat_mc ; -defm : DSWritePat_mc ; +defm : DSWritePat_t16 ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; defm : DSWritePat_mc ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll index a3b6c283512f3..7f45b038b6d0d 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll @@ -1,208 +1,493 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; GCN-LABEL: {{^}}atomic_load_monotonic_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u8 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @atomic_load_monotonic_i8(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u8 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic i8, ptr addrspace(3) %ptr monotonic, align 1 ret i8 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u8 v0, v0 offset:16{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @atomic_load_monotonic_i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u8 v0, v0 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v0, v0 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i8_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u8_d16 v0, v0 offset:16 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i8_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u8 v0, v0 offset:16 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 %load = load atomic i8, ptr addrspace(3) %gep monotonic, align 1 ret i8 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_i16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2 ret i16 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_i16_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_i16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_i16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16 %load = load atomic i16, ptr addrspace(3) %gep monotonic, align 2 ret i16 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load atomic i32, ptr addrspace(3) %ptr monotonic, align 4 ret i32 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i32_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16 %load = load atomic i32, ptr addrspace(3) %gep monotonic, align 4 ret i32 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @atomic_load_monotonic_i64(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load atomic i64, ptr addrspace(3) %ptr monotonic, align 8 ret i64 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_i64_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @atomic_load_monotonic_i64_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 16 %load = load atomic i64, ptr addrspace(3) %gep monotonic, align 8 ret i64 %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f32_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define float @atomic_load_monotonic_f32_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_f32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, ptr addrspace(3) %ptr, i32 16 %load = load atomic float, ptr addrspace(3) %gep monotonic, align 4 ret float %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f64_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define double @atomic_load_monotonic_f64_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_f64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds double, ptr addrspace(3) %ptr, i32 16 %load = load atomic double, ptr addrspace(3) %gep monotonic, align 8 ret double %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_p0i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define ptr @atomic_load_monotonic_p0i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_p0i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_p0i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[0:1], v0 offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_p0i8_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b64 v[0:1], v0 offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds ptr, ptr addrspace(3) %ptr, i32 16 %load = load atomic ptr, ptr addrspace(3) %gep monotonic, align 8 ret ptr %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_p3i8_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define ptr addrspace(3) @atomic_load_monotonic_p3i8_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_p3i8_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_p3i8_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_load_monotonic_p3i8_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v0, v0 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) %ptr, i32 16 %load = load atomic ptr addrspace(3), ptr addrspace(3) %gep monotonic, align 4 ret ptr addrspace(3) %load } -; GCN-LABEL: {{^}}atomic_load_monotonic_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_f16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic half, ptr addrspace(3) %ptr monotonic, align 2 %ret = bitcast half %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_f16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_f16_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_f16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_f16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_f16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_f16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16 %load = load atomic half, ptr addrspace(3) %gep monotonic, align 2 %ret = bitcast half %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_bf16(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %load = load atomic bfloat, ptr addrspace(3) %ptr monotonic, align 2 %ret = bitcast bfloat %load to i16 ret i16 %ret } -; GCN-LABEL: {{^}}atomic_load_monotonic_bf16_offset: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_read_u16 v0, v0 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @atomic_load_monotonic_bf16_offset(ptr addrspace(3) %ptr) { +; CI-LABEL: atomic_load_monotonic_bf16_offset: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_u16 v0, v0 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_load_monotonic_bf16_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v0, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_load_monotonic_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_load_monotonic_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16 %load = load atomic bfloat, ptr addrspace(3) %gep monotonic, align 2 %ret = bitcast bfloat %load to i16 ret i16 %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll index cd1e1fb1add47..9236b4018317a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll @@ -1,156 +1,470 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s -; GCN-LABEL: {{^}}atomic_store_monotonic_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b8 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) { +; CI-LABEL: atomic_store_monotonic_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b8 v0, v1 +; CI-NEXT: ds_write_b8 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i8 %val, 2 store atomic i8 %val, ptr addrspace(3) %ptr monotonic, align 1 + store atomic i8 %val1, ptr addrspace(3) %ptr monotonic, align 1 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i8: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b8 v0, v1 offset:16{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) { - %gep = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 - store atomic i8 %val, ptr addrspace(3) %gep monotonic, align 1 +; CI-LABEL: atomic_store_monotonic_offset_i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:8 +; CI-NEXT: ds_write_b8 v0, v2 offset:16 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i8: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 offset:16 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i8: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i8 %val, 2 + %gep_1 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 8 + %gep_2 = getelementptr inbounds i8, ptr addrspace(3) %ptr, i8 16 + store atomic i8 %val, ptr addrspace(3) %gep_1 monotonic, align 1 + store atomic i8 %val1, ptr addrspace(3) %gep_2 monotonic, align 1 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) { +; CI-LABEL: atomic_store_monotonic_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i16 %val, 2 store atomic i16 %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic i16 %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %val1 = add i16 %val, 2 %gep = getelementptr inbounds i16, ptr addrspace(3) %ptr, i16 16 store atomic i16 %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic i16 %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b32 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i32(ptr addrspace(3) %ptr, i32 %val) { +; CI-LABEL: atomic_store_monotonic_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store atomic i32 %val, ptr addrspace(3) %ptr monotonic, align 4 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i32: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i32(ptr addrspace(3) %ptr, i32 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v1 offset:64 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v1 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_offset_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b32 v0, v1 offset:64 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 16 store atomic i32 %val, ptr addrspace(3) %gep monotonic, align 4 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i64(ptr addrspace(3) %ptr, i64 %val) { +; CI-LABEL: atomic_store_monotonic_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b64 v0, v[1:2] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v0, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b64 v0, v[1:2] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store atomic i64 %val, ptr addrspace(3) %ptr monotonic, align 8 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_i64: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val) { +; CI-LABEL: atomic_store_monotonic_offset_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b64 v0, v[1:2] offset:128 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b64 v0, v[1:2] offset:128 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: atomic_store_monotonic_offset_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ds_store_b64 v0, v[1:2] offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i64 16 store atomic i64 %val, ptr addrspace(3) %gep monotonic, align 8 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 %val = bitcast i16 %arg.val to half + %val1 = bitcast i16 %arg.val1 to half store atomic half %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic half %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_f16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_offset_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to half %val = bitcast i16 %arg.val to half %gep = getelementptr inbounds half, ptr addrspace(3) %ptr, i32 16 store atomic half %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic half %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 +; CI-NEXT: ds_write_b16 v0, v2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to bfloat %val = bitcast i16 %arg.val to bfloat store atomic bfloat %val, ptr addrspace(3) %ptr monotonic, align 2 + store atomic bfloat %val1, ptr addrspace(3) %ptr monotonic, align 2 ret void } -; GCN-LABEL: {{^}}atomic_store_monotonic_offset_bf16: -; GCN: s_waitcnt -; GFX9-NOT: s_mov_b32 m0 -; CI-NEXT: s_mov_b32 m0 -; GCN-NEXT: ds_write_b16 v0, v1 offset:32{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.val) { +; CI-LABEL: atomic_store_monotonic_offset_bf16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; CI-NEXT: ds_write_b16 v0, v1 offset:32 +; CI-NEXT: ds_write_b16 v0, v2 offset:32 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: atomic_store_monotonic_offset_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v2, 2, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:32 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: atomic_store_monotonic_offset_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2 +; GFX11-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: atomic_store_monotonic_offset_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32 +; GFX11-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %arg.val1 = add i16 %arg.val, 2 + %val1 = bitcast i16 %arg.val1 to bfloat %val = bitcast i16 %arg.val to bfloat %gep = getelementptr inbounds bfloat, ptr addrspace(3) %ptr, i32 16 store atomic bfloat %val, ptr addrspace(3) %gep monotonic, align 2 + store atomic bfloat %val1, ptr addrspace(3) %gep monotonic, align 2 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 806fe899a9149..c739ba2183ef9 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -224,15 +224,25 @@ define <2 x half> @chain_hi_to_lo_group() { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: ds_load_u16 v0, v1 offset:2 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v1 offset:2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v1 offset:2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1 %load_lo = load half, ptr addrspace(3) %gep_lo @@ -263,14 +273,23 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_l ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group_different_bases: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_u16 v0, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_different_bases: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_different_bases: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, ptr addrspace(3) %base_lo %load_hi = load half, ptr addrspace(3) %base_hi @@ -780,16 +799,27 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %p ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 -; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX11-TRUE16: ; %bb.0: ; %bb +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v1, v0 offset:2 +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX11-FAKE16: ; %bb.0: ; %bb +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: ds_load_u16 v1, v0 offset:2 +; GFX11-FAKE16-NEXT: ds_load_u16_d16_hi v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1 %load_lo = load volatile i16, ptr addrspace(3) %gep_lo @@ -1047,12 +1077,12 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt ; GFX11-TRUE16-LABEL: chain_hi_to_lo_group_may_alias_store: ; GFX11-TRUE16: ; %bb.0: ; %bb ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x7b -; GFX11-TRUE16-NEXT: ds_load_u16 v3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x7b +; GFX11-TRUE16-NEXT: ds_load_u16_d16_hi v2, v0 ; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 -; GFX11-TRUE16-NEXT: ds_load_u16 v0, v0 offset:2 +; GFX11-TRUE16-NEXT: ds_load_u16_d16 v2, v0 offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: chain_hi_to_lo_group_may_alias_store: diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 8f702da64c508..7819da8b97e55 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -227,13 +228,22 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_max_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_max_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 offset:65535 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_max_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %x.i, 4 %add = add i32 %shl, 65535 @@ -271,14 +281,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %.neg = mul i32 %x.i, -4 %add = add i32 %.neg, 65535 @@ -316,14 +336,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -359,14 +389,24 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 -; GFX11-NEXT: ds_store_b8 v0, v1 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v1, 0x10000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 +; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 +; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 +; GFX11-FAKE16-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index dcb1d0e8c20a1..027576630c877 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -17,11 +17,15 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-SDAG,GFX1200-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1200,GFX1200-GISEL,GFX1200-GISEL-FAKE16 %s ; Test for integer mad formation for patterns used in clpeak @@ -324,71 +328,137 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -1461,71 +1531,137 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -4315,71 +4451,137 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i8: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i8 %x, 1 %add = mul i8 %conv33, %y @@ -4584,113 +4786,221 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 -; GFX11-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 -; GFX11-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i8: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 -; GFX1200-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 -; GFX1200-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.l, v0.h, v3.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v1.h, v0.l, v2.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v3.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.l, v1.l, v3.l +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v1.h, v1.h, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v1.l, v0.h, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v0.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v4, v1, v3, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v5, v0, v2, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v3, v4, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v2, v5, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v3, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v2, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v0.l, v2.l, v0.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v0.h, v3.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v3.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v2.h, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v3.h, v1.h, v3.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.l, v1.l, v2.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v1.h, v1.h, v3.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v2.h, v0.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v3.h, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_v2i8: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i8> %x, %add = mul <2 x i8> %y18, %y @@ -7656,103 +7966,201 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_imad_pat_i16_x2: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -7915,103 +8323,201 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v1, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 1 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v0.h, v1.h +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.h, 1 +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v1, v2, v3, 1 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -8792,51 +9298,95 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.h, v1.l, v3.l +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.l, v0.l, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v2.h, v0.l, v1.l, v3.l +; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: multi_use_mul_mad_i16_var: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX1200-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z0 @@ -8956,27 +9506,93 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: other_use_mul_mad_i16_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX11-NEXT: ds_store_b16 v3, v4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-LABEL: other_use_mul_mad_i16_var: -; GFX1200: ; %bb.0: ; %entry -; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-NEXT: s_wait_expcnt 0x0 -; GFX1200-NEXT: s_wait_samplecnt 0x0 -; GFX1200-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-NEXT: s_wait_kmcnt 0x0 -; GFX1200-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX1200-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX1200-NEXT: ds_store_b16 v3, v4 -; GFX1200-NEXT: s_wait_dscnt 0x0 -; GFX1200-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX1200-SDAG-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-TRUE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX1200-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX1200-GISEL-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l +; GFX1200-GISEL-TRUE16-NEXT: ds_store_b16_d16_hi v3, v0 +; GFX1200-GISEL-TRUE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-GISEL-FAKE16-LABEL: other_use_mul_mad_i16_var: +; GFX1200-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX1200-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-GISEL-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-GISEL-FAKE16-NEXT: ds_store_b16 v3, v4 +; GFX1200-GISEL-FAKE16-NEXT: s_wait_dscnt 0x0 +; GFX1200-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z