From 7801f98f725340bbf74128160f33196016276234 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 19 Dec 2024 12:19:45 +0000 Subject: [PATCH 1/3] [AMDGPU] Add some more GFX12 test coverage --- .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 567 +++++++++++ .../inst-select-scalar-float-sop1.mir | 1 + .../inst-select-scalar-float-sop2.mir | 1 + .../inst-select-scalar-float-sopc.mir | 1 + .../GlobalISel/llvm.amdgcn.interp.inreg.ll | 168 ++++ llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 66 ++ .../global-saddr-atomics-min-max-system.ll | 905 ++++++++++++++++++ .../CodeGen/AMDGPU/global-saddr-atomics.ll | 787 +++++++++++++++ .../AMDGPU/llvm.amdgcn.interp.inreg.ll | 118 +++ .../AMDGPU/llvm.amdgcn.intersect_ray.ll | 367 ++++++- .../CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll | 2 + llvm/test/CodeGen/AMDGPU/mesa3d.ll | 7 +- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 175 ++++ llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll | 2 + llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll | 122 ++- llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 2 + .../CodeGen/AMDGPU/swizzle.bit.extract.ll | 26 + llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir | 378 ++++++++ llvm/test/MC/AMDGPU/gfx12_asm_features.s | 6 + 19 files changed, 3646 insertions(+), 55 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll create mode 100644 llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index dc9e1f2443830..d62da6921b347 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s ; RUN: 
llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. @@ -85,6 +86,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 ret void @@ -162,6 +175,18 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 
store i32 %result, ptr addrspace(1) %out, align 4 @@ -221,6 +246,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: ds_inc_u32 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -278,6 +313,16 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -350,6 +395,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 ret void @@ -426,6 +482,17 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -503,6 +570,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset_sistem: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 store i32 %result, ptr addrspace(1) %out, align 4 @@ -567,6 +646,16 @@ define amdgpu_kernel void 
@global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -633,6 +722,16 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -700,6 +799,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr 
i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 ret void @@ -788,6 +898,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id @@ -867,6 +990,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 @@ -956,6 +1091,23 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; 
GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX12-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX12-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 @@ -1042,6 +1194,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -1124,6 +1289,19 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; 
GFX12-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1188,6 +1366,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: ds_inc_u64 v2, v[0:1] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8 ret void } @@ -1250,6 +1439,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %result = atomicrmw 
uinc_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8 ret void @@ -1327,6 +1527,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -1408,6 +1620,18 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1490,6 +1714,19 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 store i64 %result, ptr addrspace(1) %out, align 4 @@ -1559,6 +1796,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8 ret void } @@ -1630,6 +1878,17 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] 
offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8 ret void @@ -1702,6 +1961,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 ret void @@ -1795,6 +2066,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:40 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr 
addrspace(1) %ptr, i32 %id %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id @@ -1879,6 +2164,19 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:40 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 @@ -1961,6 +2259,19 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 ret void @@ -2047,6 +2358,19 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -2134,6 +2458,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 store i32 %result, ptr %out, align 4 @@ -2203,6 +2541,17 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4 ret void } @@ -2276,6 +2625,17 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4 ret void @@ -2350,6 +2710,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 ret void @@ -2464,6 +2836,27 @@ define 
amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: flat_store_b32 v[0:1], v3 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_store_b32 v[0:1], v3 +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %out.gep = getelementptr i32, ptr %out, i32 %id @@ -2560,6 +2953,23 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, ptr %ptr, i32 %id %gep = getelementptr i32, ptr %gep.tid, i32 5 @@ -2655,6 +3065,25 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; GFX11-NEXT: global_store_b32 v3, v2, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_shl_base_lds_0_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v0, 9 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX12-NEXT: v_add_nc_u32_e32 v2, 2, v2 +; GFX12-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v3, v2, s[2:3] +; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 @@ -2754,6 +3183,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void @@ -2855,6 +3298,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -2957,6 +3414,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 @@ -3031,6 +3503,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3109,6 +3593,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void @@ -3188,6 +3684,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 42 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void @@ -3313,6 +3822,28 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id @@ -3413,6 +3944,23 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-NEXT: v_mov_b32_e32 v2, 42 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v4, 3, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 @@ -3514,6 +4062,25 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: nocse_lds_atomic_inc_ret_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x10 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX12-NEXT: s_endpgm %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4 store i32 %result0, ptr addrspace(1) %out0, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir index ae4e5feb0d744..130f87e44eac7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: sitofp_i32_to_f32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir index dac85561208d4..d80a13c4d7c79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: fadd_f32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir index e065e09766dd9..c75a2926e7cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s --- name: f32_olt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index 2215df9cef262..1813003181d45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 
-mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GFX11-LABEL: v_interp_f32: @@ -21,6 +23,25 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m ; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 ; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -57,6 +78,31 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr ; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; 
GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -99,6 +145,31 @@ define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0 ; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many_vm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_interp_p10_f32 
v6, v2, v0, v2 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done +; GFX12-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1 %i = load float, ptr addrspace(1) %i.ptr, align 4 @@ -156,6 +227,42 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m ; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_f16: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX12-TRUE16-NEXT: 
v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_f16: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) @@ -202,6 +309,42 @@ define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inre ; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_rtz_f16: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2 +; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0 +; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_rtz_f16: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo +; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2 +; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) @@ -237,6 +380,31 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 ; 
GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: v_interp_f16_imm_params: +; GFX12-TRUE16: ; %bb.0: ; %main_body +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7 +; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: v_interp_f16_imm_params: +; GFX12-FAKE16: ; %bb.0: ; %main_body +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX12-FAKE16-NEXT: ; return to shader part epilog main_body: %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0) %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index f0fa621e3b4bc..64838a98f34d2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3766,3 +3766,69 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %insert.1 = insertelement <2 x 
double> %insert.0, double %max1, i32 1 ret <2 x double> %insert.1 } + +; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3 +; since there are no pack instructions for fmaximum3. +define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { +; GFX12-LABEL: no_fmaximum3_v2f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: no_fmaximum3_v2f16: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v2, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc +; GFX940-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: no_fmaximum3_v2f16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v3, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] +entry: + %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max) + %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d) + ret <2 x half> %res +} diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll index 6b4a6381d954c..bfd57aebad521 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s ; Test using saddr addressing mode of global_* flat atomic instructions. 
@@ -85,6 +86,34 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst @@ -168,6 +197,34 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: 
v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -245,6 +302,31 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst @@ -320,6 +402,31 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) 
%sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -415,6 +522,38 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst @@ -510,6 +649,38 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: v_mov_b32_e32 
v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -596,6 +767,34 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, 
s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst @@ -680,6 +879,34 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: 
v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -767,6 +994,34 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe 
+; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst @@ -850,6 +1105,34 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr 
addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -927,6 +1210,31 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB10_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1002,6 +1310,31 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB11_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, 
null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_i32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1097,6 +1430,38 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], 
v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1192,6 +1557,38 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1278,6 +1675,34 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1362,6 +1787,34 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1449,6 +1902,34 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to 
shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1532,6 +2013,34 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB17_1: ; 
%atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1609,6 +2118,31 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; 
GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst @@ -1684,6 +2218,31 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB19_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1779,6 +2338,38 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) 
inreg % ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst @@ -1874,6 +2465,38 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1960,6 +2583,34 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB22_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; 
GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2044,6 +2695,34 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB23_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 
v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2131,6 +2810,34 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz 
.LBB24_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst @@ -2214,6 +2921,34 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2291,6 +3026,31 @@ define amdgpu_ps void 
@global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB26_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst @@ -2366,6 +3126,31 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_min_u32_e32 v4, v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2461,6 +3246,38 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; 
GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2556,6 +3373,38 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v9, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2642,6 +3491,34 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_cbranch_execnz .LBB30_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = 
atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst @@ -2726,6 +3603,34 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_cbranch_execnz .LBB31_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128 +; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1] +; GFX12-NEXT: s_mov_b64 s[0:1], 0 +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2] +; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v5, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll index fb72dcacee4cf..a7225a104ff32 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d 
-mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s ; Test using saddr addressing mode of global_* flat atomic instructions. @@ -28,6 +29,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -58,6 +66,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_2047: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047 @@ -89,6 +104,13 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(ptr addrspace( ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048 @@ -119,6 +141,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -153,6 +182,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i32_rtn_2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048 @@ -184,6 +220,13 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: 
global_xchg_saddr_i32_rtn_neg2048: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048 @@ -238,6 +281,18 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -286,6 +341,18 @@ define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -335,6 +402,18 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -382,6 +461,18 @@ define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i3 ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: ds_load_b64 v[2:3], v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset @@ -421,6 +512,13 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv 
; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -451,6 +549,13 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xchg_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -482,6 +587,13 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -511,6 +623,13 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; 
GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xchg_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -545,6 +664,13 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -575,6 +701,13 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -606,6 +739,13 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; 
GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw add ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -635,6 +775,13 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -665,6 +812,13 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -695,6 +849,13 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; 
GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_add_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -726,6 +887,13 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw add ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -755,6 +923,13 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_add_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -789,6 +964,13 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; 
GFX12-LABEL: global_sub_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -819,6 +1001,13 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -850,6 +1039,13 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -879,6 +1075,13 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
global_sub_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -909,6 +1112,13 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -939,6 +1149,13 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_sub_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -970,6 +1187,13 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; 
GFX12-LABEL: global_sub_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -999,6 +1223,13 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_sub_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1033,6 +1264,13 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1063,6 +1301,13 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: 
global_and_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1094,6 +1339,13 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1123,6 +1375,13 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1153,6 +1412,13 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1183,6 +1449,13 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_and_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1214,6 +1487,13 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1243,6 +1523,13 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_and_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; 
GFX12-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1277,6 +1564,13 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1307,6 +1601,13 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1338,6 +1639,13 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
global_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1367,6 +1675,13 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1397,6 +1712,13 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1427,6 +1749,13 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_or_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], 
s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1458,6 +1787,13 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1487,6 +1823,13 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_or_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1521,6 +1864,13 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1551,6 +1901,13 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1582,6 +1939,13 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst @@ -1611,6 +1975,13 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1641,6 +2012,13 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1671,6 +2049,13 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_xor_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1702,6 +2087,13 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst @@ -1731,6 +2123,13 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_xor_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1762,6 +2161,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -1789,6 +2195,13 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 
+; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1816,6 +2229,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -1841,6 +2261,13 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1868,6 +2295,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; 
GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -1895,6 +2329,13 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_max_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1922,6 +2363,13 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -1947,6 +2395,13 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_max_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -1978,6 +2433,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2005,6 +2467,13 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2032,6 +2501,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2057,6 +2533,13 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2084,6 +2567,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2111,6 +2601,13 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_min_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 
0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2138,6 +2635,13 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2163,6 +2667,13 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_min_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2194,6 +2705,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; 
GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2221,6 +2739,13 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2248,6 +2773,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2273,6 +2805,13 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE 
+; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2300,6 +2839,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2327,6 +2873,13 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2354,6 +2907,13 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: 
global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2379,6 +2939,13 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2410,6 +2977,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2437,6 +3011,13 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2464,6 +3045,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2489,6 +3077,13 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2516,6 +3111,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg % ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2543,6 +3145,13 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2570,6 +3179,13 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst @@ -2595,6 +3211,13 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2632,6 +3255,15 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(ptr addrspace(1) inreg %sba ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -2666,6 +3298,15 @@ define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(ptr addrspace(1) inr ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2701,6 +3342,15 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
global_cmpxchg_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = cmpxchg ptr addrspace(1) %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -2733,6 +3383,15 @@ define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2769,6 +3428,16 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(ptr addrspace(1) inre ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, 
i64 %data seq_cst seq_cst @@ -2806,6 +3475,16 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(ptr addrspace( ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2844,6 +3523,16 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(ptr addrspace(1) inreg %sb ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = cmpxchg ptr addrspace(1) %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -2879,6 +3568,16 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) in ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v6, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 
v0, v[3:6], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2902,6 +3601,12 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -2921,6 +3626,12 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2939,6 +3650,11 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
global_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -2955,6 +3671,11 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -2974,6 +3695,12 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -2993,6 +3720,12 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_inc_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 
+; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3011,6 +3744,11 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3027,6 +3765,11 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_inc_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3051,6 +3794,12 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 
%zext.offset %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -3070,6 +3819,12 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg % ; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i32_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3088,6 +3843,11 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i32_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic @@ -3104,6 +3864,11 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i32_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3123,6 +3888,12 @@ define amdgpu_ps <2 x float> 
@global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i64_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3142,6 +3913,12 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i ; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: global_dec_saddr_i64_rtn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 @@ -3160,6 +3937,11 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i64_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic @@ -3176,6 +3958,11 @@ define amdgpu_ps void 
@global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_dec_saddr_i64_nortn_neg128: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll index bf545c82f2d56..19da3f4503aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 { ; GFX11-LABEL: v_interp_f32: @@ -21,6 +22,25 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m ; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 ; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX12-NEXT: v_mov_b32_e32 v4, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7 +; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -57,6 +77,31 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr ; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0 +; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 
wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done +; GFX12-NEXT: s_endpgm main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0) @@ -99,6 +144,31 @@ define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0 ; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 ; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: v_interp_f32_many_vm: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4 +; GFX12-NEXT: s_mov_b32 m0, s0 +; GFX12-NEXT: s_mov_b32 s0, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3 +; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2 +; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1 +; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7 +; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done +; GFX12-NEXT: s_endpgm main_body: %i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1 %i = load float, ptr addrspace(1) %i.ptr, align 4 @@ -156,6 +226,24 
@@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m ; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_f16: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0) @@ -202,6 +290,24 @@ define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inre ; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_rtz_f16: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1 +; GFX12-NEXT: s_mov_b32 exec_lo, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GFX12-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GFX12-NEXT: v_add_f16_e32 v0, v3, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) @@ -237,6 +343,18 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0 ; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: v_interp_f16_imm_params: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 +; GFX12-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX12-NEXT: v_add_f16_e32 v0, v1, v0 +; GFX12-NEXT: ; return to shader part epilog main_body: %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0) %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 1b41a10eec3fd..e592a4ac5e8fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s +; TODO: Run these for global isel as well. +; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr) ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr) @@ -18,11 +21,17 @@ declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 ; Arguments are flattened to represent the actual VGPR_A layout, so we have no ; extra moves in the generated kernel. 
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh_intersect_ray: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; PRE-GFX12-LABEL: image_bvh_intersect_ray: +; PRE-GFX12: ; %bb.0: ; %main_body +; PRE-GFX12-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[0:3] +; PRE-GFX12-NEXT: s_waitcnt vmcnt(0) +; PRE-GFX12-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: image_bvh_intersect_ray: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[5:7], v[8:10]], s[0:3] +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog main_body: %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 @@ -79,6 +88,48 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: s_lshr_b32 s2, s7, 16 +; GFX12-SDAG-NEXT: s_lshr_b32 s3, s5, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s3, s5, s7 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3 +; GFX12-SDAG-NEXT: 
s_pack_ll_b32_b16 s4, s6, s8 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4 +; GFX12-SDAG-NEXT: s_mov_b32 s15, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s14, s11 +; GFX12-SDAG-NEXT: s_mov_b32 s13, s10 +; GFX12-SDAG-NEXT: s_mov_b32 s12, s9 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_mov_b32 s20, s2 +; GFX12-GISEL-NEXT: s_mov_b32 s22, s4 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX12-GISEL-NEXT: s_mov_b32 s21, s3 +; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s7, s5 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX12-GISEL-NEXT: s_mov_b32 s16, s9 +; GFX12-GISEL-NEXT: s_mov_b32 s17, s10 +; GFX12-GISEL-NEXT: s_mov_b32 s18, s11 +; GFX12-GISEL-NEXT: s_mov_b32 s19, s12 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -88,11 +139,17 @@ main_body: ; Arguments are flattened to represent the actual VGPR_A layout, so we have no ; extra moves in the generated kernel. 
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { -; GCN-LABEL: image_bvh64_intersect_ray: -; GCN: ; %bb.0: ; %main_body -; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ; return to shader part epilog +; PRE-GFX12-LABEL: image_bvh64_intersect_ray: +; PRE-GFX12: ; %bb.0: ; %main_body +; PRE-GFX12-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3] +; PRE-GFX12-NEXT: s_waitcnt vmcnt(0) +; PRE-GFX12-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: image_bvh64_intersect_ray: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[6:8], v[9:11]], s[0:3] +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: ; return to shader part epilog main_body: %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0 @@ -152,6 +209,50 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1 +; GFX12-SDAG-NEXT: s_lshr_b32 s3, s6, 16 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s1, s6, s8 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-SDAG-NEXT: s_lshr_b32 s0, s8, 16 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, s2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: 
s_pack_ll_b32_b16 s0, s3, s0 +; GFX12-SDAG-NEXT: s_pack_ll_b32_b16 s3, s7, s9 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s15, s13 +; GFX12-SDAG-NEXT: s_mov_b32 s14, s12 +; GFX12-SDAG-NEXT: s_mov_b32 s13, s11 +; GFX12-SDAG-NEXT: s_mov_b32 s12, s10 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_mov_b32 s20, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s21, s4 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s4, s8, s6 +; GFX12-GISEL-NEXT: s_mov_b32 s22, s5 +; GFX12-GISEL-NEXT: s_pack_hh_b32_b16 s5, s8, s6 +; GFX12-GISEL-NEXT: s_pack_ll_b32_b16 s6, s9, s7 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; GFX12-GISEL-NEXT: s_mov_b32 s16, s10 +; GFX12-GISEL-NEXT: s_mov_b32 s17, s11 +; GFX12-GISEL-NEXT: s_mov_b32 s18, s12 +; GFX12-GISEL-NEXT: s_mov_b32 s19, s13 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) %r = bitcast <4 x i32> %v to <4 x float> @@ -239,6 +340,69 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm 
+; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 4.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: flat_load_b32 v9, v[0:1] +; GFX12-SDAG-NEXT: flat_load_b32 v10, v[2:3] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 +; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; 
GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v9, v[0:1] +; GFX12-GISEL-NEXT: flat_load_b32 v10, v[2:3] +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid @@ -329,6 +493,62 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX12-SDAG-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX12-SDAG-NEXT: flat_load_b32 v6, v[0:1] +; GFX12-SDAG-NEXT: flat_load_b32 v7, v[2:3] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v3, 0 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-GISEL-NEXT: s_mov_b32 s2, 2.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s1, 1.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-GISEL-NEXT: 
flat_load_b32 v6, v[0:1] +; GFX12-GISEL-NEXT: flat_load_b32 v7, v[2:3] +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_node_ptr = getelementptr inbounds i32, ptr %p_node_ptr, i32 %lid @@ -429,6 +649,69 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 +; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX12-SDAG-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x40c00000 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x40e00000 +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3] +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0xb36211c7 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x40400000 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s12, 0x40c00000 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x40a00000 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 4.0 +; GFX12-GISEL-NEXT: s_mov_b32 s14, 0x41000000 +; GFX12-GISEL-NEXT: s_mov_b32 s13, 0x40e00000 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, s12 +; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v10, 4.0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v7, s13 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v11, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] +; GFX12-GISEL-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid @@ -521,6 +804,64 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX12-SDAG: ; %bb.0: ; %main_body +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0x48004500 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: v_bfrev_b32_e32 v7, 4.0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 +; GFX12-SDAG-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0x46004200 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s5, 1.0 +; GFX12-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX12-GISEL-NEXT: s_mov_b32 s8, 0x42004600 +; GFX12-GISEL-NEXT: s_mov_b32 s9, 0x44004700 +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-GISEL-NEXT: s_mov_b32 s10, 0x45004800 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v6, 0xb36211c6 +; GFX12-GISEL-NEXT: v_bfrev_b32_e32 v7, 4.0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-GISEL-NEXT: s_mov_b32 s6, 2.0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-GISEL-NEXT: flat_load_b32 v8, v[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: flat_store_b128 v[0:1], v[0:3] +; GFX12-GISEL-NEXT: s_endpgm main_body: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep_ray = getelementptr inbounds float, ptr %p_ray, i32 %lid diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 049cc455ab01c..53e37479f68e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -2,6 +2,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=1 
-mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; GFX10PLUS-LABEL: {{^}}dpp8_test: ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll index 7f0f473c11bd5..0460f83b5773e 100644 --- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll +++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll @@ -1,14 +1,15 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11PLUS %s ; SPI_TMPRING_SIZE.WAVESIZE = 5 ; GFX10: .long 165608 ; GFX10-NEXT: .long 20480 ; SPI_TMPRING_SIZE.WAVESIZE = 17 -; GFX11: .long 165608 -; GFX11-NEXT: .long 69632 +; GFX11PLUS: .long 165608 +; GFX11PLUS-NEXT: .long 69632 ; GCN-LABEL: {{^}}scratch_ps: ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll new file mode 100644 index 0000000000000..b90011d95c691 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %descTable1, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 { +; GFX11-LABEL: mixed_vmem_types: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_mov_b32 s0, s3 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: buffer_load_b32 v4, off, s[40:43], 0 +; GFX11-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_and_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo +;
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[24:27], 0 +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: mixed_vmem_types: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_mov_b32 s0, s3 +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: s_mov_b32 s1, s5 +; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null +; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null +; GFX12-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 +; GFX12-NEXT: s_wait_loadcnt 0x1 +; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-NEXT: s_wait_samplecnt 0x1 +; GFX12-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_and_b32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_and_b32 s0, s0, s2 +; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: buffer_store_b32 v0, off, s[24:27], null +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: mixed_vmem_types: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_getpc_b64 s[20:21] +; GFX12-GISEL-NEXT: s_mov_b32 s0, s3 +; GFX12-GISEL-NEXT: s_sext_i32_i16 s21, s21 +; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 +; GFX12-GISEL-NEXT: s_mov_b32 s1, s21 +; GFX12-GISEL-NEXT: s_mov_b32 s3, s21 +; GFX12-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null +; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null +; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, s2 +; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-GISEL-NEXT: buffer_store_b32 v0, off, s[24:27], null +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %0 = call i64 @llvm.amdgcn.s.getpc() + %extelt.offset = lshr i64 %0, 32 + %.i1 = trunc i64 %extelt.offset to i32 + %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0 + %1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1 + %2 = bitcast <2 x i32> %1 to i64 + %3 = inttoptr i64 %2 to ptr addrspace(4) + 
%.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0 + %4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1 + %5 = bitcast <2 x i32> %4 to i64 + %6 = inttoptr i64 %5 to ptr addrspace(4) + %7 = getelementptr i8, ptr addrspace(4) %6, i64 80 + %8 = load <4 x i32>, ptr addrspace(4) %7, align 16 + %9 = getelementptr i8, ptr addrspace(4) %3, i64 48 + %10 = load <4 x i32>, ptr addrspace(4) %9, align 16 + %11 = getelementptr i8, ptr addrspace(4) %6, i64 64 + %12 = load <4 x i32>, ptr addrspace(4) %11, align 16 + %13 = getelementptr i8, ptr addrspace(4) %6, i64 16 + %14 = load <4 x i32>, ptr addrspace(4) %13, align 16 + %15 = getelementptr i8, ptr addrspace(4) %6, i64 32 + %16 = load <8 x i32>, ptr addrspace(4) %15, align 32 + %17 = load <4 x i32>, ptr addrspace(4) %6, align 16 + %18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %16, <4 x i32> %17, i1 false, i32 0, i32 0) + %19 = fcmp oeq float %18, 0.000000e+00 + %20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %14, i32 0, i32 0, i32 0) + %.not = icmp eq i32 %20, 2752 + %21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %12, i32 0, i32 0, i32 0) + %.not1 = icmp eq i32 %21, 2752 + %22 = getelementptr i8, ptr addrspace(4) %3, i64 16 + %23 = load <8 x i32>, ptr addrspace(4) %22, align 32 + %24 = load <4 x i32>, ptr addrspace(4) %3, align 16 + %25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %23, <4 x i32> %24, i1 false, i32 0, i32 0) + %26 = fcmp oeq float %25, 1.000000e+00 + %27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %10, i32 0, i32 0, i32 0) + %.not2 = icmp eq i32 %27, 2752 + %28 = select i1 %.not2, i1 %26, i1 false + %29 = select i1 %28, i1 %.not1, i1 false + %30 = select i1 %29, i1 %.not, i1 false + %narrow2 = select i1 %30, i1 %19, i1 false + %.4 = zext i1 %narrow2 to i32 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %8, i32 0, i32 0, i32 0) + ret void 
+} + +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 +declare ptr addrspace(7) @lgc.late.launder.fat.pointer(<4 x i32>) #2 +declare i64 @llvm.amdgcn.s.getpc() #3 +declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 +declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 immarg, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 +declare float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 immarg, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #1 +declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind speculatable willreturn memory(none) } +attributes #4 = { nounwind willreturn memory(write) } diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll index 5e6849ec61b47..4865290fd51d8 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck %s define amdgpu_vs float @sitofp_i32_to_f32(i32 inreg %val) { ; CHECK-LABEL: sitofp_i32_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll 
b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll index cf73803f8929d..81d792183dc06 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX1150 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX12 %s define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) { ; CHECK-LABEL: fadd_f32: @@ -36,23 +38,37 @@ define amdgpu_vs float @fmul_f32(float inreg %a, float inreg %b) { } define amdgpu_vs float @fmin_f32(float inreg %a, float inreg %b) { -; CHECK-LABEL: fmin_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_min_f32 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmin_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_min_f32 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmin_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_min_num_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) ret float %min } define amdgpu_vs float @fmax_f32(float inreg %a, float inreg %b) { -; 
CHECK-LABEL: fmax_f32: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_max_f32 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmax_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_max_f32 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmax_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_max_num_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) ret float %max } @@ -91,23 +107,37 @@ define amdgpu_vs half @fmul_f16(half inreg %a, half inreg %b) { } define amdgpu_vs half @fmin_f16(half inreg %a, half inreg %b) { -; CHECK-LABEL: fmin_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_min_f16 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmin_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_min_f16 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmin_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_min_num_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) ret half %min } define amdgpu_vs half @fmax_f16(half inreg %a, half inreg %b) { -; CHECK-LABEL: fmax_f16: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_max_f16 s0, s0, s1 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: fmax_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_max_f16 s0, s0, s1 +; GFX1150-NEXT: s_delay_alu 
instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmax_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_max_num_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) ret half %max } @@ -179,19 +209,33 @@ define amdgpu_vs half @fmac_f16_with_mov(half inreg %a, half inreg %b, half inre ; Regression test for crash in SIFoldOperands define amdgpu_ps float @_amdgpu_ps_main() { -; CHECK-LABEL: _amdgpu_ps_main: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: s_mov_b32 s2, s0 -; CHECK-NEXT: s_mov_b32 s3, s0 -; CHECK-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 -; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: ; return to shader part epilog +; GFX1150-LABEL: _amdgpu_ps_main: +; GFX1150: ; %bb.0: ; %bb +; GFX1150-NEXT: s_mov_b32 s0, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1150-NEXT: s_mov_b32 s1, s0 +; GFX1150-NEXT: s_mov_b32 s2, s0 +; GFX1150-NEXT: s_mov_b32 s3, s0 +; GFX1150-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX1150-NEXT: v_mov_b32_e32 v0, s0 +; GFX1150-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: _amdgpu_ps_main: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0 +; 
GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog bb: %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0) %i1 = bitcast i32 %i to float diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll index ca508eb400170..3d283d6b18507 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) { ; SDAG-LABEL: f32_olt: diff --git a/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll new file mode 100644 index 0000000000000..c5763c68e1dab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swizzle.bit.extract.ll @@ -0,0 +1,26 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=instruction-select -o - %s | FileCheck %s --check-prefixes=GCN,PREGFX12-GISEL +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-SDAG +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select -o - 
%s | FileCheck %s --check-prefixes=GCN,GFX12PLUS-GISEL + +; GCN-LABEL: name: buffer_swizzle_bit_pregfx12 +; PREGFX12-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 1, implicit $exec +; PREGFX12-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 1, implicit $exec +; GFX12PLUS-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN killed {{%[0-9]+}}, killed {{%[0-9]+}}, $sgpr_null, 0, 8, 0, implicit $exec +; GFX12PLUS-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, $sgpr_null, 0, 8, 0, implicit $exec +define amdgpu_ps <4 x float> @buffer_swizzle_bit_pregfx12(<4 x i32> inreg %0) { + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 8) + ret <4 x float> %data +} + +; GCN-LABEL: name: buffer_swizzle_bit_gfx12plus +; PREGFX12-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, killed {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 0, implicit $exec +; PREGFX12-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, {{%[0-9]+}}, 0, 0, 0, implicit $exec +; GFX12PLUS-SDAG: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN killed {{%[0-9]+}}, killed {{%[0-9]+}}, $sgpr_null, 0, 0, 1, implicit $exec +; GFX12PLUS-GISEL: {{%[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN {{%[0-9]+}}, {{%[0-9]+}}, $sgpr_null, 0, 0, 1, implicit $exec +define amdgpu_ps <4 x float> @buffer_swizzle_bit_gfx12plus(<4 x i32> inreg %0) { + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 64) + ret <4 x float> %data +} + +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir index b94e5c450cd17..26b03e7cdf8ad 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir 
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -2,11 +2,13 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX12 %s --- | define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void } define amdgpu_kernel void @max-counter-vmcnt() #0 { ret void } define amdgpu_kernel void @max-counter-expcnt() #0 { ret void } + define amdgpu_kernel void @max-counter-dscnt() #0 { ret void } attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ... @@ -112,6 +114,41 @@ body: | ; GFX11-NEXT: S_WAITCNT 64743 ; GFX11-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-lgkmcnt + ; GFX12: liveins: $vgpr99 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX12-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX12-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX12-NEXT: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX12-NEXT: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX12-NEXT: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX12-NEXT: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable 
$vgpr99, 12, 13, 0, implicit $exec + ; GFX12-NEXT: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX12-NEXT: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX12-NEXT: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX12-NEXT: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX12-NEXT: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX12-NEXT: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX12-NEXT: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX12-NEXT: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX12-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX12-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec + ; GFX12-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 17 + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 16 + ; GFX12-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 15 + ; GFX12-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 14 + ; GFX12-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec @@ -377,6 +414,87 @@ body: | ; GFX11-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, 
$vgpr3, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-vmcnt + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, implicit $exec + ; 
GFX12-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, implicit $exec + ; GFX12-NEXT: 
$vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, implicit $exec + ; GFX12-NEXT: 
$vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_LOADCNT 62 + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr2 = 
V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec @@ -502,6 +620,24 @@ body: | ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-expcnt + ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX12-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec @@ -513,3 +649,245 @@ body: | $vgpr0 = 
V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec S_ENDPGM 0 ... + +--- +name: max-counter-dscnt +body: | + bb.0: + liveins: $vgpr99 + + ; GFX9-LABEL: name: max-counter-dscnt + ; GFX9: liveins: $vgpr99 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: S_WAITCNT 0 + ; GFX9-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: 
$vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX9-NEXT: S_WAITCNT 52863 + ; GFX9-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-dscnt + ; GFX10: liveins: $vgpr99 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: S_WAITCNT 0 + ; GFX10-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr8 = 
DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 
0, implicit $exec + ; GFX10-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX10-NEXT: S_WAITCNT 59263 + ; GFX10-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: max-counter-dscnt + ; GFX11: liveins: $vgpr99 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: S_WAITCNT 0 + ; GFX11-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr19 = DS_READ_B32_gfx9 
$vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX11-NEXT: S_WAITCNT 65143 + ; GFX11-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 + ; GFX12-LABEL: name: max-counter-dscnt + ; GFX12: liveins: $vgpr99 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; GFX12-NEXT: S_WAIT_EXPCNT 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT 0 + ; GFX12-NEXT: S_WAIT_BVHCNT 0 + ; GFX12-NEXT: S_WAIT_KMCNT 0 + ; GFX12-NEXT: $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr1 = 
DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, 
implicit $exec + ; GFX12-NEXT: $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + ; GFX12-NEXT: S_WAIT_DSCNT 39 + ; GFX12-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + $vgpr0 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr1 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr2 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr3 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr4 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr5 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr6 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr7 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr8 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr9 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr10 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr11 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr12 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr13 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr14 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr15 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr16 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr17 = DS_READ_B32_gfx9 $vgpr99, 0, 0, 
implicit $exec + $vgpr18 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr19 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr20 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr21 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr22 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr23 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr24 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr25 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr26 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr27 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr28 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr29 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr30 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr31 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr32 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr33 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr34 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr35 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr36 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr37 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr38 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr39 = DS_READ_B32_gfx9 $vgpr99, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + S_ENDPGM 0 +... 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s index 7393de2878f8a..776b4ca595099 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s @@ -66,6 +66,12 @@ image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC_IMG_3D scope:SCOPE_SYS th:TH_LOAD_NT // GFX12: encoding: [0x02,0xc0,0xc6,0xe4,0x1d,0x40,0x1c,0x22,0x1f,0x20,0x21,0x00] +global_load_block v[9:40], v[5:6], off th:TH_LOAD_HT scope:SCOPE_SE +// GFX12: encoding: [0x7c,0xc0,0x14,0xee,0x09,0x00,0x24,0x00,0x05,0x00,0x00,0x00] + +global_load_block v[9:40], v[5:6], off scope:SCOPE_SE th:TH_LOAD_HT +// GFX12: encoding: [0x7c,0xc0,0x14,0xee,0x09,0x00,0x24,0x00,0x05,0x00,0x00,0x00] + buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV // GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] From 8b0f05b207ffe9111a485ec1b42ea5564d6baf1d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 20 Dec 2024 09:31:47 +0000 Subject: [PATCH 2/3] Use named values --- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 89 ++++++++------------ 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index b90011d95c691..0f67a404972aa 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -115,61 +115,46 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: buffer_store_b32 v0, off, s[24:27], null ; GFX12-GISEL-NEXT: s_endpgm .entry: - %0 = call i64 @llvm.amdgcn.s.getpc() - %extelt.offset = lshr i64 %0, 32 + %i = call i64 @llvm.amdgcn.s.getpc() + %extelt.offset = lshr i64 %i, 32 %.i1 = trunc i64 %extelt.offset to i32 %.upto0 = insertelement <2 x i32> poison, i32 %descTable1, i64 0 - 
%1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1 - %2 = bitcast <2 x i32> %1 to i64 - %3 = inttoptr i64 %2 to ptr addrspace(4) + %i1 = insertelement <2 x i32> %.upto0, i32 %.i1, i64 1 + %i2 = bitcast <2 x i32> %i1 to i64 + %i3 = inttoptr i64 %i2 to ptr addrspace(4) %.upto03 = insertelement <2 x i32> poison, i32 %descTable0, i64 0 - %4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1 - %5 = bitcast <2 x i32> %4 to i64 - %6 = inttoptr i64 %5 to ptr addrspace(4) - %7 = getelementptr i8, ptr addrspace(4) %6, i64 80 - %8 = load <4 x i32>, ptr addrspace(4) %7, align 16 - %9 = getelementptr i8, ptr addrspace(4) %3, i64 48 - %10 = load <4 x i32>, ptr addrspace(4) %9, align 16 - %11 = getelementptr i8, ptr addrspace(4) %6, i64 64 - %12 = load <4 x i32>, ptr addrspace(4) %11, align 16 - %13 = getelementptr i8, ptr addrspace(4) %6, i64 16 - %14 = load <4 x i32>, ptr addrspace(4) %13, align 16 - %15 = getelementptr i8, ptr addrspace(4) %6, i64 32 - %16 = load <8 x i32>, ptr addrspace(4) %15, align 32 - %17 = load <4 x i32>, ptr addrspace(4) %6, align 16 - %18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %16, <4 x i32> %17, i1 false, i32 0, i32 0) - %19 = fcmp oeq float %18, 0.000000e+00 - %20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %14, i32 0, i32 0, i32 0) - %.not = icmp eq i32 %20, 2752 - %21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %12, i32 0, i32 0, i32 0) - %.not1 = icmp eq i32 %21, 2752 - %22 = getelementptr i8, ptr addrspace(4) %3, i64 16 - %23 = load <8 x i32>, ptr addrspace(4) %22, align 32 - %24 = load <4 x i32>, ptr addrspace(4) %3, align 16 - %25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %23, <4 x i32> %24, i1 false, i32 0, i32 0) - %26 = fcmp oeq float %25, 1.000000e+00 - %27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %10, i32 0, i32 0, i32 0) - %.not2 = icmp eq i32 %27, 2752 - %28 = select i1 %.not2, i1 %26, i1 
false - %29 = select i1 %28, i1 %.not1, i1 false - %30 = select i1 %29, i1 %.not, i1 false - %narrow2 = select i1 %30, i1 %19, i1 false + %i4 = insertelement <2 x i32> %.upto03, i32 %.i1, i64 1 + %i5 = bitcast <2 x i32> %i4 to i64 + %i6 = inttoptr i64 %i5 to ptr addrspace(4) + %i7 = getelementptr i8, ptr addrspace(4) %i6, i64 80 + %i8 = load <4 x i32>, ptr addrspace(4) %i7, align 16 + %i9 = getelementptr i8, ptr addrspace(4) %i3, i64 48 + %i10 = load <4 x i32>, ptr addrspace(4) %i9, align 16 + %i11 = getelementptr i8, ptr addrspace(4) %i6, i64 64 + %i12 = load <4 x i32>, ptr addrspace(4) %i11, align 16 + %i13 = getelementptr i8, ptr addrspace(4) %i6, i64 16 + %i14 = load <4 x i32>, ptr addrspace(4) %i13, align 16 + %i15 = getelementptr i8, ptr addrspace(4) %i6, i64 32 + %i16 = load <8 x i32>, ptr addrspace(4) %i15, align 32 + %i17 = load <4 x i32>, ptr addrspace(4) %i6, align 16 + %i18 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i16, <4 x i32> %i17, i1 false, i32 0, i32 0) + %i19 = fcmp oeq float %i18, 0.000000e+00 + %i20 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i14, i32 0, i32 0, i32 0) + %.not = icmp eq i32 %i20, 2752 + %i21 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i12, i32 0, i32 0, i32 0) + %.not1 = icmp eq i32 %i21, 2752 + %i22 = getelementptr i8, ptr addrspace(4) %i3, i64 16 + %i23 = load <8 x i32>, ptr addrspace(4) %i22, align 32 + %i24 = load <4 x i32>, ptr addrspace(4) %i3, align 16 + %i25 = call float @llvm.amdgcn.image.sample.lz.2d.f32.f16.v8i32.v4i32(i32 1, half 0xHBC00, half 0xHBC00, <8 x i32> %i23, <4 x i32> %i24, i1 false, i32 0, i32 0) + %i26 = fcmp oeq float %i25, 1.000000e+00 + %i27 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %i10, i32 0, i32 0, i32 0) + %.not2 = icmp eq i32 %i27, 2752 + %i28 = select i1 %.not2, i1 %i26, i1 false + %i29 = select i1 %i28, i1 %.not1, i1 false + %i30 = select i1 %i29, i1 %.not, i1 false + %narrow2 = select i1 
%i30, i1 %i19, i1 false %.4 = zext i1 %narrow2 to i32 - call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %8, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %.4, <4 x i32> %i8, i32 0, i32 0, i32 0) ret void } - -declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 -declare ptr addrspace(7) @lgc.late.launder.fat.pointer(<4 x i32>) #2 -declare i64 @llvm.amdgcn.s.getpc() #3 -declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 -declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 immarg, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 -declare float @llvm.amdgcn.image.sample.lz.2d.f32.f16(i32 immarg, half, half, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 -declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) #1 -declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind willreturn memory(read) } -attributes #2 = { nounwind memory(none) } -attributes #3 = { nounwind speculatable willreturn memory(none) } -attributes #4 = { nounwind willreturn memory(write) } From 5800790059573b974ef6741710f0baca107b0abf Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 20 Dec 2024 09:36:41 +0000 Subject: [PATCH 3/3] Remove redundant test --- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 66 --------------------------- 1 file changed, 66 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 64838a98f34d2..f0fa621e3b4bc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3766,69 +3766,3 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double %insert.1 = insertelement <2 x 
double> %insert.0, double %max1, i32 1 ret <2 x double> %insert.1 } - -; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3 -; since there are no pack instructions for fmaximum3. -define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { -; GFX12-LABEL: no_fmaximum3_v2f16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0 -; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX940-LABEL: no_fmaximum3_v2f16: -; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_max_f16 v4, v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX940-NEXT: s_mov_b32 s0, 0x5040100 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v6, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 -; GFX940-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc -; GFX940-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 -; GFX940-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: no_fmaximum3_v2f16: -; GFX950: ; %bb.0: ; %entry -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] -entry: - %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) - %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max) - %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d) - ret <2 x half> %res -}