diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c17fda1346115..12716bdde1a9f 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1678,15 +1678,12 @@ def : FlatLoadPat_D16 ; let OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -1702,6 +1699,9 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; } let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { @@ -1709,6 +1709,9 @@ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16 defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>; +defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>; defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>; defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>; } // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 3e15b135eeab9..7889ae636464c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -2,6 +2,8 @@ ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-atomic-optimizer-strategy=None -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) { ; SI-LABEL: atomic_add_i32_offset: @@ -41,6 +43,19 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -89,6 +104,19 @@ define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_max_neg_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:-4096 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -135,6 +163,19 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_soffset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0x8000 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:3232 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -185,6 +226,21 @@ define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_huge_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX11-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 @@ -241,6 +297,20 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_add_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -300,6 +370,24 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -370,6 +458,25 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_add_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -416,6 +523,19 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -470,6 +590,20 @@ define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_add_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -526,6 +660,24 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -593,6 +745,25 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_add_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_add_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -638,6 +809,19 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -693,6 +877,20 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_and_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -752,6 +950,24 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -822,6 +1038,25 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_and_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -868,6 +1103,19 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -922,6 +1170,20 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_and_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -978,6 +1240,24 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -1045,6 +1325,25 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_and_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_and_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -1090,6 +1389,19 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -1145,6 +1457,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_sub_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -1204,6 +1530,24 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -1274,6 +1618,25 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_sub_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -1320,6 +1683,19 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -1374,6 +1750,20 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_sub_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -1430,6 +1820,24 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -1497,6 +1905,25 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_sub_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_sub_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -1542,6 +1969,19 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -1594,6 +2034,19 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_max_i32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -1647,6 +2100,23 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -1714,6 +2184,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_max_i32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -1754,6 +2242,18 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -1805,6 +2305,19 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_max_i32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -1855,6 +2368,23 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -1919,6 +2449,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_max_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_max_i32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -1958,6 +2506,18 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2010,6 +2570,19 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_max_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2063,6 +2636,23 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -2130,6 +2720,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_max_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -2170,6 +2778,18 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -2221,6 +2841,19 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_max_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -2271,6 +2904,23 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2335,6 +2985,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umax_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_max_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2374,6 +3042,18 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2426,6 +3106,19 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_min_i32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2479,6 +3172,23 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -2546,6 +3256,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_min_i32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -2586,6 +3314,18 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -2637,6 +3377,19 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_min_i32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -2687,6 +3440,23 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2751,6 +3521,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_min_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_min_i32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -2790,6 +3578,18 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2842,6 +3642,19 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_min_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst @@ -2895,6 +3708,23 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -2962,6 +3792,24 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_min_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -3002,6 +3850,18 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst ret void @@ -3053,6 +3913,19 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_min_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -3103,6 +3976,23 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -3167,6 +4057,24 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_umin_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_min_u32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst @@ -3212,6 +4120,19 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -3267,6 +4188,20 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_or_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -3326,6 +4261,24 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -3396,6 +4349,25 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_or_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -3442,6 +4414,19 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -3496,6 +4481,20 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_or_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -3552,6 +4551,24 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -3619,6 +4636,25 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_or_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_or_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -3664,6 +4700,19 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -3708,6 +4757,19 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float % ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_f32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, float %in syncscope("agent") seq_cst @@ -3763,6 +4825,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -3822,6 +4898,24 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -3892,6 +4986,25 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %o ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -3938,6 +5051,19 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -3992,6 +5118,20 @@ define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrsp ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -4048,6 +5188,24 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -4115,6 +5273,25 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xchg_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -4164,6 +5341,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4222,6 +5411,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4288,6 +5492,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -4364,6 +5587,26 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -4415,6 +5658,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void @@ -4472,6 +5727,21 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i32, i1 } %val, 0 @@ -4535,6 +5805,25 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x3c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4608,6 +5897,26 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_cmpxchg_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x44 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst @@ -4654,6 +5963,19 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -4709,6 +6031,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_xor_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -4768,6 +6104,24 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -4838,6 +6192,25 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_xor_b32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -4884,6 +6257,19 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst ret void @@ -4938,6 +6324,20 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_xor_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst store i32 %val, ptr addrspace(1) %out2 @@ -4994,6 +6394,24 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -5061,6 +6479,25 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_xor_i32_ret_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_xor_b32 v1, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst @@ -5113,6 +6550,18 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 4 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4 @@ -5167,6 +6616,18 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_negoffset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] offset:-512 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4 @@ -5219,6 +6680,18 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(1) %in, i64 4 %val = load atomic float, ptr addrspace(1) %gep seq_cst, align 4 @@ -5269,6 +6742,18 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 store i32 %val, ptr addrspace(1) %out @@ -5332,6 +6817,24 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -5395,6 +6898,24 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index %val = load atomic i32, ptr addrspace(1) %ptr seq_cst, align 4 @@ -5459,6 +6980,24 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_load_f32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr addrspace(1) %in, i64 %index %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4 @@ -5501,6 +7040,16 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %ou ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 store atomic i32 %in, ptr addrspace(1) %gep seq_cst, align 4 @@ -5539,6 +7088,16 @@ define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 ret void @@ -5576,6 +7135,16 @@ define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: store atomic float %in, ptr addrspace(1) %out seq_cst, align 4 ret void @@ -5624,6 +7193,20 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -5674,6 +7257,20 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspa ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4 @@ -5723,6 +7320,20 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %ou ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index store atomic i32 %in, ptr addrspace(1) %ptr seq_cst, align 4 @@ -5771,6 +7382,20 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) % ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f32_addr64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: s_add_u32 s0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s1, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index store atomic float %in, ptr addrspace(1) %ptr seq_cst, align 4 @@ -5820,6 +7445,30 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i8_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i8_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u8 v1, v0, s[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 16 %val = load atomic i8, ptr addrspace(1) %gep seq_cst, align 1 @@ -5874,6 +7523,30 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i8_negoffset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i8_negoffset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u8 v1, v0, s[0:1] offset:-512 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512 %val = load atomic i8, ptr addrspace(1) %gep seq_cst, align 1 @@ -5915,6 +7588,16 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i8_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i8, ptr addrspace(1) %out, i64 16 store atomic i8 %in, ptr addrspace(1) %gep seq_cst, align 1 @@ -5953,6 +7636,16 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i8: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1 ret void @@ -6001,6 +7694,30 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i16_offset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i16_offset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 8 %val = load atomic i16, ptr addrspace(1) %gep seq_cst, align 2 @@ -6055,6 +7772,30 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_i16_negoffset: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_i16_negoffset: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:-512 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256 %val = load atomic i16, ptr addrspace(1) %gep seq_cst, align 2 @@ -6096,6 +7837,16 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i16, ptr addrspace(1) %out, i64 8 store atomic i16 %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6134,6 +7885,16 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6173,6 +7934,16 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr half, ptr addrspace(1) %out, i64 8 store atomic half %in, ptr addrspace(1) %gep seq_cst, align 2 @@ -6211,6 +7982,16 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) { ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm entry: store atomic half %in, ptr addrspace(1) %out seq_cst, align 2 ret void @@ -6250,6 +8031,16 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8 store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2 ret void @@ -6287,6 +8078,16 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_store_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2 ret void } @@ -6329,6 +8130,19 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6377,6 +8191,19 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_max_neg_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:-4096 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6423,6 +8250,19 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_soffset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0x8000 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:3232 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6473,6 +8313,21 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_huge_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX11-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6528,6 +8383,20 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6587,6 +8456,24 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -6657,6 +8544,25 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -6703,6 +8609,19 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6751,6 +8670,19 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_max_neg_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:-4096 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6797,6 +8729,19 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_soffset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0x8000 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:3232 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6847,6 +8792,21 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_huge_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_add_u32 s0, s0, 0xdeac +; GFX11-NEXT: s_addc_u32 s1, s1, 0xabcd +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6902,6 +8862,20 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst @@ -6961,6 +8935,24 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s6 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -7031,6 +9023,25 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_dec_i32_ret_addr64_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c +; GFX11-NEXT: s_load_b32 s8, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 2 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_add_u32 s0, s0, s4 +; GFX11-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm entry: %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4 @@ -7082,6 +9093,30 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_f16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_f16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 8 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 store half %val, ptr addrspace(1) %out @@ -7135,6 +9170,30 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_f16_negoffset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_f16_negoffset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:-512 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr half, ptr addrspace(1) %in, i64 -256 %val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2 store half %val, ptr addrspace(1) %out @@ -7184,6 +9243,30 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_bf16_offset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_bf16_offset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:16 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 store bfloat %val, ptr addrspace(1) %out @@ -7237,6 +9320,30 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-TRUE16-LABEL: atomic_load_bf16_negoffset: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: atomic_load_bf16_negoffset: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] offset:-512 glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-FAKE16-NEXT: s_endpgm %gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256 %val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2 store bfloat %val, ptr addrspace(1) %out