From e030c29267df783c5b28ae4026d8850a7ec894a7 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 21 Aug 2025 12:13:54 +0200 Subject: [PATCH 1/2] [AMDGPU][gfx1250] Add memory legalizer tests --- .../memory-legalizer-fence-mmra-global.ll | 171 ++ .../memory-legalizer-fence-mmra-local.ll | 100 + .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 239 +++ .../AMDGPU/memory-legalizer-flat-agent.ll | 1651 ++++++++++++++++ .../AMDGPU/memory-legalizer-flat-lastuse.ll | 51 + .../memory-legalizer-flat-nontemporal.ll | 65 + .../memory-legalizer-flat-singlethread.ll | 1241 ++++++++++++ .../AMDGPU/memory-legalizer-flat-system.ll | 1709 +++++++++++++++++ .../AMDGPU/memory-legalizer-flat-volatile.ll | 80 + .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1225 ++++++++++++ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1223 ++++++++++++ .../AMDGPU/memory-legalizer-global-agent.ll | 1613 ++++++++++++++++ .../AMDGPU/memory-legalizer-global-lastuse.ll | 56 + .../memory-legalizer-global-nontemporal.ll | 67 + .../memory-legalizer-global-singlethread.ll | 1241 ++++++++++++ .../AMDGPU/memory-legalizer-global-system.ll | 1586 +++++++++++++++ .../memory-legalizer-global-volatile.ll | 81 + .../memory-legalizer-global-wavefront.ll | 1241 ++++++++++++ .../memory-legalizer-global-workgroup.ll | 1271 ++++++++++++ .../AMDGPU/memory-legalizer-local-agent.ll | 1211 ++++++++++++ .../memory-legalizer-local-nontemporal.ll | 69 + .../memory-legalizer-local-singlethread.ll | 1165 +++++++++++ .../AMDGPU/memory-legalizer-local-system.ll | 1211 ++++++++++++ .../AMDGPU/memory-legalizer-local-volatile.ll | 80 + .../memory-legalizer-local-wavefront.ll | 1165 +++++++++++ 25 files changed, 19812 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 80445f793934b..20822c71198b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -78,6 +79,10 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-LABEL: workgroup_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -145,6 +150,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -217,6 +226,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -289,6 +302,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -359,6 +376,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -426,6 +447,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -498,6 +523,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -570,6 +599,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -662,6 +695,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -744,6 +784,14 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -842,6 +890,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -940,6 +997,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1032,6 +1098,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1114,6 +1187,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1212,6 +1293,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1310,6 +1400,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1404,6 +1503,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1490,6 +1596,15 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1594,6 +1709,16 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1698,6 +1823,16 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1792,6 +1927,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1878,6 +2020,15 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1982,6 +2133,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2086,6 +2247,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 7a419a5031ba9..767dbc1432242 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -76,6 +77,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -142,6 +148,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -208,6 +218,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -274,6 +288,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -331,6 +349,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -388,6 +410,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -445,6 +471,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -502,6 +532,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -570,6 +604,11 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -636,6 +675,10 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -702,6 +745,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -768,6 +815,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -825,6 +876,10 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-LABEL: agent_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -882,6 +937,10 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -939,6 +998,10 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -996,6 +1059,10 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1064,6 +1131,11 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1130,6 +1202,10 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1196,6 +1272,10 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1262,6 +1342,10 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1319,6 +1403,10 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-LABEL: system_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1376,6 +1464,10 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1433,6 +1525,10 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1490,6 +1586,10 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..8d7194b834385 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -65,6 +66,10 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX12-CU-LABEL: singlethread_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire ret void @@ -122,6 +127,10 @@ define amdgpu_kernel void @singlethread_release_fence() { ; GFX12-CU-LABEL: singlethread_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") release ret void @@ -179,6 +188,10 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel ret void @@ -236,6 +249,10 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst ret void @@ -293,6 +310,10 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire ret void @@ -350,6 +371,10 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX12-CU-LABEL: singlethread_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release ret void @@ -407,6 +432,10 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -464,6 +493,10 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -521,6 +554,10 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX12-CU-LABEL: wavefront_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire ret void @@ -578,6 +615,10 @@ define amdgpu_kernel void @wavefront_release_fence() { ; GFX12-CU-LABEL: wavefront_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") release ret void @@ -635,6 +676,10 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel ret void @@ -692,6 +737,10 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst ret void @@ -749,6 +798,10 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire ret void @@ -806,6 +859,10 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX12-CU-LABEL: wavefront_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release ret void @@ -863,6 +920,10 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -920,6 +981,10 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -998,6 +1063,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1073,6 +1143,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1153,6 +1228,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1233,6 +1313,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1303,6 +1388,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1370,6 +1459,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1442,6 +1535,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1514,6 +1611,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1606,6 +1707,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire ret void @@ -1688,6 +1796,14 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1786,6 +1902,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -1884,6 +2009,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -1976,6 +2110,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire ret void @@ -2058,6 +2199,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2156,6 +2305,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2254,6 +2412,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2348,6 +2515,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire ret void @@ -2434,6 +2608,15 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence release ret void @@ -2538,6 +2721,16 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2642,6 +2835,16 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst ret void @@ -2736,6 +2939,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire ret void @@ -2822,6 +3032,15 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -2926,6 +3145,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3030,6 +3259,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 07ad8cb0c4a3d..05b599c6bc1c7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4 @@ -566,6 +589,18 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4 @@ -789,6 +824,24 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -939,6 +992,16 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1088,6 +1151,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1261,6 +1334,20 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1434,6 +1521,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1583,6 +1684,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1763,6 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -1936,6 +2059,20 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2140,6 +2277,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2344,6 +2497,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2552,6 +2721,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2789,6 +2971,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3026,6 +3227,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3264,6 +3484,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3533,6 +3767,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3795,6 +4045,24 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4088,6 +4356,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4381,6 +4669,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4650,6 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4919,6 +5243,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5212,6 +5552,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5505,6 +5865,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5798,6 +6178,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6091,6 +6491,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6384,6 +6804,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6677,6 +7117,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6970,6 +7430,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7263,6 +7743,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7545,6 +8045,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7844,6 +8360,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8152,6 +8685,26 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8479,6 +9032,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8806,6 +9382,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9109,6 +9708,25 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9408,6 +10026,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9735,6 +10370,29 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10062,6 +10720,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10389,6 +11070,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10716,6 +11420,29 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11039,6 +11766,27 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11366,6 +12114,29 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11693,6 +12464,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12020,6 +12814,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +13021,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4 @@ -12386,6 +13214,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4 @@ -12593,6 +13432,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4 @@ -12826,6 +13678,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -12976,6 +13847,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -13125,6 +14006,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -13298,6 +14189,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -13471,6 +14376,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -13620,6 +14539,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -13796,6 +14725,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -13969,6 +14910,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -14169,6 +15124,22 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14369,6 +15340,22 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14587,6 +15574,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14834,6 +15835,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15081,6 +16102,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15319,6 +16360,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15584,6 +16639,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15846,6 +16917,24 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16135,6 +17224,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16424,6 +17533,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16689,6 +17818,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16954,6 +18099,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17243,6 +18404,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17532,6 +18713,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17821,6 +19022,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18110,6 +19331,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18399,6 +19640,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18688,6 +19949,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20258,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19266,6 +20567,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19548,6 +20869,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19857,6 +21194,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20165,6 +21520,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20502,6 +21877,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20839,6 +22238,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21152,6 +22575,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21461,6 +22904,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21798,6 +23259,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22135,6 +23620,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22472,6 +23981,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22809,6 +24342,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23142,6 +24699,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23479,6 +25058,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23816,6 +25419,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24153,6 +25780,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index a00af8e5b6582..0b5e6e9da7418 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: @@ -16,6 +17,17 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -55,6 +67,21 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid @@ -80,6 +107,19 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -100,6 +140,17 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_and_nontemporal_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 3c24c36ec547d..58f33dfed87d6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4, !nontemporal !0 @@ -555,6 +567,21 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -739,6 +766,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -1095,6 +1133,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1293,6 +1345,19 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a98..ed90e278d1e86 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4 @@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4 @@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4 @@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4 @@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 @@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 @@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20965,6 +22190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 919fc3e8f4e4f..3c6ffdd1a6332 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in monotonic, align 4 @@ -568,6 +591,18 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in acquire, align 4 @@ -793,6 +828,24 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -943,6 +996,16 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out unordered, align 4 @@ -1092,6 +1155,16 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1269,6 +1342,21 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1446,6 +1534,21 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1595,6 +1698,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1777,6 +1890,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -1954,6 +2079,21 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2164,6 +2304,23 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2374,6 +2531,23 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2584,6 +2758,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2827,6 +3014,26 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3070,6 +3277,26 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3308,6 +3535,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3579,6 +3820,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3845,6 +4102,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4144,6 +4420,27 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4443,6 +4740,27 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4714,6 +5032,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4985,6 +5319,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5284,6 +5634,27 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5583,6 +5954,27 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5882,6 +6274,27 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6181,6 +6594,27 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6480,6 +6914,27 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6779,6 +7234,27 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7078,6 +7554,27 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7377,6 +7874,27 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7659,6 +8177,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7960,6 +8494,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8272,6 +8823,27 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8605,6 +9177,30 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8938,6 +9534,30 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9243,6 +9863,25 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9544,6 +10183,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9877,6 +10533,30 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10210,6 +10890,30 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10543,6 +11247,30 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10876,6 +11604,30 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11205,6 +11957,28 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11538,6 +12312,30 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11871,6 +12669,30 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +13026,30 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12388,6 +13234,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4 @@ -12570,6 +13427,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4 @@ -12779,6 +13647,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4 @@ -13014,6 +13895,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -13164,6 +14064,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -13313,6 +14223,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -13490,6 +14410,21 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") release, align 4 @@ -13667,6 +14602,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -13816,6 +14766,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -13994,6 +14954,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -14171,6 +15143,21 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -14377,6 +15364,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -14583,6 +15587,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -14803,6 +15824,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15056,6 +16091,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15309,6 +16365,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15547,6 +16624,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15814,6 +16905,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16080,6 +17187,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16375,6 +17501,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16670,6 +17817,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16937,6 +18105,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17204,6 +18388,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17499,6 +18699,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17794,6 +19015,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18089,6 +19331,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18384,6 +19647,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18679,6 +19963,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18974,6 +20279,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19269,6 +20595,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19564,6 +20911,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19846,6 +21214,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20157,6 +21541,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20469,6 +21871,27 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20812,6 +22235,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21155,6 +22603,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21470,6 +22943,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21781,6 +23274,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22124,6 +23635,31 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22467,6 +24003,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22810,6 +24371,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23153,6 +24739,31 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23492,6 +25103,29 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23835,6 +25469,31 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24178,6 +25837,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24521,6 +26205,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb4..e0cf88891e421 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -143,6 +144,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -415,6 +429,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -563,6 +594,18 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -831,6 +874,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -971,6 +1029,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -1090,6 +1159,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 7c637a20ab47b..16135f5fc5d6f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4 @@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4 @@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4 @@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4 @@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..18e6812ded962 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -776,6 +810,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -926,6 +972,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1075,6 +1131,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1241,6 +1307,17 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1407,6 +1484,17 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1556,6 +1644,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1724,6 +1822,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -1890,6 +1999,17 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2075,6 +2195,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2260,6 +2392,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2465,6 +2609,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2690,6 +2846,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2915,6 +3084,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3153,6 +3335,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3410,6 +3606,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3665,6 +3876,21 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3939,6 +4165,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4213,6 +4455,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4470,6 +4728,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4727,6 +5000,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5001,6 +5289,22 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5275,6 +5579,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5549,6 +5869,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5823,6 +6159,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6105,6 +6457,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6401,6 +6769,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6702,6 +7086,23 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7017,6 +7418,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7332,6 +7750,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7630,6 +8065,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7926,6 +8377,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8241,6 +8708,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8556,6 +9040,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8871,6 +9372,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9186,6 +9704,23 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9499,6 +10034,23 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9814,6 +10366,23 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10129,6 +10698,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10444,6 +11030,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10628,6 +11231,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4 @@ -10810,6 +11424,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11000,6 +11625,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4 @@ -11202,6 +11838,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11352,6 +11999,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4 @@ -11501,6 +12158,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11660,6 +12327,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -11819,6 +12496,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -11968,6 +12655,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12127,6 +12824,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12286,6 +12993,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release @@ -12455,6 +13172,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12624,6 +13351,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12825,6 +13562,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13039,6 +13788,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13253,6 +14014,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13491,6 +14264,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13739,6 +14526,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13987,6 +14788,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14245,6 +15060,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14503,6 +15332,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14751,6 +15594,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14999,6 +15856,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15257,6 +16128,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15515,6 +16400,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15773,6 +16672,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16031,6 +16944,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16289,6 +17216,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16547,6 +17488,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16805,6 +17760,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17063,6 +18032,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17345,6 +18328,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17637,6 +18636,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17931,6 +18946,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18235,6 +19266,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18539,6 +19586,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18833,6 +19896,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19125,6 +20204,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19429,6 +20524,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19733,6 +20844,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20037,6 +21164,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20341,6 +21484,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20643,6 +21802,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20947,6 +22122,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21251,6 +22442,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21555,6 +22762,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 74a72e04fa4ae..51859c112bf9f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4 @@ -574,6 +597,18 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4 @@ -793,6 +828,24 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -950,6 +1003,16 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1106,6 +1169,16 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1287,6 +1360,20 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1468,6 +1555,20 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1622,6 +1723,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1805,6 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -1984,6 +2107,20 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release @@ -2192,6 +2329,22 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -2400,6 +2553,22 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2598,6 +2767,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2826,6 +3008,25 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3054,6 +3255,25 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3273,6 +3493,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3521,6 +3755,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3765,6 +4015,24 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4038,6 +4306,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4311,6 +4599,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4559,6 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4807,6 +5131,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5080,6 +5420,26 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5353,6 +5713,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5626,6 +6006,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5899,6 +6299,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6172,6 +6592,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6445,6 +6885,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6718,6 +7178,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6991,6 +7471,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7240,6 +7740,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7507,6 +8023,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7783,6 +8316,26 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8632,29 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8375,6 +8951,29 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8646,6 +9245,25 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8913,6 +9531,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9209,6 +9844,29 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9505,6 +10163,29 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9801,6 +10482,29 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10097,6 +10801,29 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10389,6 +11116,27 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10685,6 +11433,29 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10981,6 +11752,29 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11277,6 +12071,29 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11463,6 +12280,17 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4 @@ -11647,6 +12475,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4 @@ -11847,6 +12686,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4 @@ -12066,6 +12917,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 @@ -12223,6 +13092,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -12379,6 +13258,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -12560,6 +13449,20 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -12741,6 +13644,20 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -12895,6 +13812,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -13078,6 +14005,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -13257,6 +14196,20 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -13465,6 +14418,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -13673,6 +14642,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -13871,6 +14856,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14099,6 +15097,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14327,6 +15344,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14546,6 +15582,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14794,6 +15844,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15038,6 +16104,24 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15311,6 +16395,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15584,6 +16688,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15832,6 +16956,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16080,6 +17220,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16353,6 +17509,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16626,6 +17802,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16899,6 +18095,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17172,6 +18388,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17445,6 +18681,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17718,6 +18974,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17991,6 +19267,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18264,6 +19560,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18513,6 +19829,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18780,6 +20112,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19076,6 +20425,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19372,6 +20744,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19643,6 +21038,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19910,6 +21324,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20206,6 +21637,29 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20502,6 +21956,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20798,6 +22275,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21094,6 +22594,29 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21386,6 +22909,27 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21682,6 +23226,29 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21978,6 +23545,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22274,6 +23864,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 5f952b98041f3..2c8fa9da74862 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: @@ -14,6 +15,18 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -37,6 +50,21 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid @@ -58,6 +86,19 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -81,6 +122,21 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_and_nontemporal_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 16e55058e4fc8..e73300dbc5ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -189,6 +190,18 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0 @@ -448,6 +461,21 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -633,6 +661,18 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -866,6 +906,20 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1056,6 +1110,19 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d38716107..2633bba70ddd3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4 @@ -899,6 +944,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4 @@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4 @@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index be148464c156e..c194b49f25255 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in monotonic, align 4 @@ -576,6 +599,18 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in acquire, align 4 @@ -797,6 +832,24 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -954,6 +1007,16 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1110,6 +1173,16 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1295,6 +1368,21 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1480,6 +1568,21 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1634,6 +1737,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1819,6 +1932,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2002,6 +2127,21 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2216,6 +2356,23 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2430,6 +2587,23 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2630,6 +2804,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2864,6 +3051,26 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3098,6 +3305,26 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3317,6 +3544,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3567,6 +3808,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3815,6 +4072,25 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4094,6 +4370,27 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4373,6 +4670,27 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4623,6 +4941,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4873,6 +5207,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5152,6 +5502,27 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5431,6 +5802,27 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5710,6 +6102,27 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5989,6 +6402,27 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6238,6 +6672,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6507,6 +6957,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6809,6 +7276,30 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7111,6 +7602,30 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7384,6 +7899,25 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7653,6 +8187,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7955,6 +8506,30 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8257,6 +8832,30 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8559,6 +9158,30 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8861,6 +9484,30 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9159,6 +9806,28 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9461,6 +10130,30 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9763,6 +10456,30 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10065,6 +10782,30 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10251,6 +10992,17 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4 @@ -10435,6 +11187,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4 @@ -10637,6 +11400,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4 @@ -10858,6 +11633,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11015,6 +11808,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11171,6 +11974,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -11356,6 +12169,21 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -11541,6 +12369,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -11695,6 +12538,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -11880,6 +12733,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12063,6 +12928,21 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -12277,6 +13157,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -12491,6 +13388,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -12691,6 +13605,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12925,6 +13852,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -13159,6 +14106,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -13378,6 +14345,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13628,6 +14609,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13876,6 +14873,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14155,6 +15171,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14434,6 +15471,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14684,6 +15742,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14934,6 +16008,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15213,6 +16303,27 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15492,6 +16603,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15771,6 +16903,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16050,6 +17203,27 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16329,6 +17503,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16608,6 +17803,27 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16887,6 +18103,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17166,6 +18403,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17415,6 +18673,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17684,6 +18958,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17964,6 +19255,27 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18266,6 +19578,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18568,6 +19904,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18841,6 +20201,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19110,6 +20489,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19412,6 +20808,30 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19714,6 +21134,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20016,6 +21460,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20318,6 +21786,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20616,6 +22108,28 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20918,6 +22432,30 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21220,6 +22758,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21522,6 +23084,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79c..10d9ee0617a0e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: @@ -146,6 +147,19 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -345,6 +359,23 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -501,6 +532,19 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -693,6 +737,21 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -838,6 +897,17 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -969,6 +1039,17 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b531..f64b283edf43f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4 @@ -899,6 +944,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4 @@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4 @@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..c1879c8eb11af 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -764,6 +798,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -921,6 +967,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1077,6 +1133,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1251,6 +1317,17 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1425,6 +1502,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1579,6 +1667,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1743,6 +1841,16 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -1915,6 +2023,17 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2097,6 +2216,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2279,6 +2409,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2466,6 +2607,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2674,6 +2827,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2882,6 +3048,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3101,6 +3280,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3330,6 +3523,20 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3567,6 +3774,21 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3814,6 +4036,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4061,6 +4298,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4290,6 +4542,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4519,6 +4785,20 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4766,6 +5046,21 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5013,6 +5308,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5260,6 +5570,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5507,6 +5832,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5754,6 +6094,21 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6001,6 +6356,21 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6248,6 +6618,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6495,6 +6880,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6744,6 +7144,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7000,6 +7416,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7269,6 +7701,23 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7545,6 +7994,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7821,6 +8287,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8562,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8335,6 +8834,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8611,6 +9126,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8887,6 +9419,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9163,6 +9712,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9439,6 +10005,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9713,6 +10296,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9989,6 +10589,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10265,6 +10882,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10541,6 +11175,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10727,6 +11378,17 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4 @@ -10911,6 +11573,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11100,6 +11773,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4 @@ -11297,6 +11981,17 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11454,6 +12149,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -11610,6 +12315,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11776,6 +12491,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -11942,6 +12667,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12096,6 +12831,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12260,6 +13005,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12424,6 +13179,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -12598,6 +13363,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12772,6 +13547,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12959,6 +13744,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13159,6 +13956,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13359,6 +14168,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13578,6 +14399,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13807,6 +14642,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14036,6 +14885,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14275,6 +15138,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14514,6 +15391,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14743,6 +15634,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14972,6 +15877,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15211,6 +16130,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15450,6 +16383,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15689,6 +16636,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15928,6 +16889,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16167,6 +17142,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16406,6 +17395,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16645,6 +17648,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16884,6 +17901,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17133,6 +18164,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17389,6 +18436,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17650,6 +18713,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17918,6 +18997,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18186,6 +19281,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18444,6 +19555,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18700,6 +19827,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18968,6 +20111,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19236,6 +20395,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19504,6 +20679,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19772,6 +20963,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20038,6 +21245,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20306,6 +21529,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20574,6 +21813,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20842,6 +22097,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..fe703f5e8c90f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4 @@ -718,6 +755,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -859,6 +909,16 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4 @@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 @@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 @@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") monotonic @@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") unordered, align 4 @@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") monotonic, align 4 @@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") acquire, align 4 @@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") seq_cst, align 4 @@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") unordered, align 4 @@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") monotonic, align 4 @@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") release, align 4 @@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") seq_cst, align 4 @@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") monotonic @@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") release @@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad4..689932469d78d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -193,6 +194,18 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0 @@ -428,6 +441,22 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-CU-NEXT: s_mov_b32 s2, 2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -597,6 +626,18 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -802,6 +843,22 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-CU-NEXT: s_mov_b32 s1, 2 +; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -991,6 +1048,18 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb0..97c80ece2b053 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..fdf69a5998652 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in acquire, align 4 @@ -718,6 +755,19 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -859,6 +909,16 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out unordered, align 4 @@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out monotonic, align 4 @@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in monotonic @@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release @@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") unordered, align 4 @@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") monotonic, align 4 @@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") acquire, align 4 @@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") seq_cst, align 4 @@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") unordered, align 4 @@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") monotonic, align 4 @@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") release, align 4 @@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") seq_cst, align 4 @@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") monotonic @@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") release @@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 5e5e3bf83d610..88cba0bddf5d7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: @@ -141,6 +142,18 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4 @@ -308,6 +321,22 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-CU-NEXT: s_mov_b32 s2, 2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -429,6 +458,18 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -570,6 +611,22 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-CU-NEXT: s_mov_b32 s1, 2 +; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -698,6 +755,18 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -813,6 +882,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16b..b8ad75049aff8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 From 7f7b1bbc3f017bb93ada95a48cc400d4ec9a297d Mon Sep 17 00:00:00 2001 From: pvanhout Date: Thu, 21 Aug 2025 12:43:36 +0200 Subject: [PATCH 2/2] Drop -CU suffix --- .../memory-legalizer-fence-mmra-global.ll | 294 +- .../memory-legalizer-fence-mmra-local.ll | 152 +- .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 398 +- .../AMDGPU/memory-legalizer-flat-agent.ll | 3118 ++++++++-------- .../AMDGPU/memory-legalizer-flat-lastuse.ll | 94 +- .../memory-legalizer-flat-nontemporal.ll | 120 +- .../memory-legalizer-flat-singlethread.ll | 2298 ++++++------ .../AMDGPU/memory-legalizer-flat-system.ll | 3234 ++++++++--------- .../AMDGPU/memory-legalizer-flat-volatile.ll | 148 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 2268 ++++++------ .../AMDGPU/memory-legalizer-flat-workgroup.ll | 2270 ++++++------ .../AMDGPU/memory-legalizer-global-agent.ll | 3044 ++++++++-------- .../AMDGPU/memory-legalizer-global-lastuse.ll | 104 +- .../memory-legalizer-global-nontemporal.ll | 124 +- .../memory-legalizer-global-singlethread.ll | 2298 ++++++------ .../AMDGPU/memory-legalizer-global-system.ll | 2998 +++++++-------- .../memory-legalizer-global-volatile.ll | 150 +- .../memory-legalizer-global-wavefront.ll | 2298 ++++++------ .../memory-legalizer-global-workgroup.ll | 2358 ++++++------ .../AMDGPU/memory-legalizer-local-agent.ll | 2238 ++++++------ .../memory-legalizer-local-nontemporal.ll | 128 +- .../memory-legalizer-local-singlethread.ll | 2146 +++++------ .../AMDGPU/memory-legalizer-local-system.ll | 2238 ++++++------ .../AMDGPU/memory-legalizer-local-volatile.ll | 148 +- .../memory-legalizer-local-wavefront.ll | 2146 +++++------ .../memory-legalizer-local-workgroup.ll | 2238 ++++++------ 26 files changed, 19525 insertions(+), 19525 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 20822c71198b6..97d52d5f1f26d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -80,9 +80,9 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -151,9 +151,9 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -227,9 +227,9 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -303,9 +303,9 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -377,9 +377,9 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -448,9 +448,9 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -524,9 +524,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -600,9 +600,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -696,12 +696,12 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -785,13 +785,13 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -891,14 +891,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -998,14 +998,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1099,12 +1099,12 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1188,13 +1188,13 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1294,14 +1294,14 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1401,14 +1401,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1504,12 +1504,12 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1597,14 +1597,14 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1710,15 +1710,15 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1824,15 +1824,15 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1928,12 +1928,12 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2021,14 +2021,14 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2134,15 +2134,15 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2248,15 +2248,15 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 767dbc1432242..cc42428e1aa06 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -78,10 +78,10 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -149,9 +149,9 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -219,9 +219,9 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -289,9 +289,9 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -350,9 +350,9 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -411,9 +411,9 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -472,9 +472,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -533,9 +533,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -605,10 +605,10 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -676,9 +676,9 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -746,9 +746,9 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -816,9 +816,9 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -877,9 +877,9 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -938,9 +938,9 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -999,9 +999,9 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1060,9 +1060,9 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1132,10 +1132,10 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1203,9 +1203,9 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1273,9 +1273,9 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1343,9 +1343,9 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1404,9 +1404,9 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1465,9 +1465,9 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1526,9 +1526,9 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1587,9 +1587,9 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 8d7194b834385..b3f6533d43887 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -67,9 +67,9 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire ret void @@ -128,9 +128,9 @@ define amdgpu_kernel void @singlethread_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") release ret void @@ -189,9 +189,9 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel ret void @@ -250,9 +250,9 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst ret void @@ -311,9 +311,9 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire ret void @@ -372,9 +372,9 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release ret void @@ -433,9 +433,9 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -494,9 +494,9 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: singlethread_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: singlethread_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -555,9 +555,9 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire ret void @@ -616,9 +616,9 @@ define amdgpu_kernel void @wavefront_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") release ret void @@ -677,9 +677,9 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel ret void @@ -738,9 +738,9 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst ret void @@ -799,9 +799,9 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire ret void @@ -860,9 +860,9 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release ret void @@ -921,9 +921,9 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -982,9 +982,9 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: wavefront_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: wavefront_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -1064,10 +1064,10 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1144,10 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1229,10 +1229,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1314,10 +1314,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1389,9 +1389,9 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1460,9 +1460,9 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1536,9 +1536,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1612,9 +1612,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1708,12 +1708,12 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire ret void @@ -1797,13 +1797,13 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1903,14 +1903,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -2010,14 +2010,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -2111,12 +2111,12 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire ret void @@ -2200,13 +2200,13 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2306,14 +2306,14 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2413,14 +2413,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2516,12 +2516,12 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acquire ret void @@ -2609,14 +2609,14 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release ret void @@ -2722,15 +2722,15 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2836,15 +2836,15 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst ret void @@ -2940,12 +2940,12 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acquire_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire ret void @@ -3033,14 +3033,14 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_release_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -3146,15 +3146,15 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3260,15 +3260,15 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 05b599c6bc1c7..36adbc0011118 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4 @@ -382,16 +382,16 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4 @@ -590,17 +590,17 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4 @@ -825,23 +825,23 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -993,15 +993,15 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1152,15 +1152,15 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1335,19 +1335,19 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1522,19 +1522,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1685,15 +1685,15 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1875,17 +1875,17 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2722,18 +2722,18 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2972,24 +2972,24 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3228,24 +3228,24 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3485,19 +3485,19 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3768,21 +3768,21 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4046,23 +4046,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4357,25 +4357,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4670,25 +4670,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4959,21 +4959,21 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5244,21 +5244,21 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5553,25 +5553,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5866,25 +5866,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6179,25 +6179,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6492,25 +6492,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6805,25 +6805,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7118,25 +7118,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7431,25 +7431,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7744,25 +7744,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8046,21 +8046,21 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8361,22 +8361,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8686,25 +8686,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9033,28 +9033,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9383,28 +9383,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9709,24 +9709,24 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10027,22 +10027,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10371,28 +10371,28 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10721,28 +10721,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11071,28 +11071,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11421,28 +11421,28 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11767,26 +11767,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12115,28 +12115,28 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12465,28 +12465,28 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12815,28 +12815,28 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13022,16 +13022,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4 @@ -13215,16 +13215,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4 @@ -13433,18 +13433,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4 @@ -13679,24 +13679,24 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -13848,15 +13848,15 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -14007,15 +14007,15 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -14190,19 +14190,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -14377,19 +14377,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -14540,15 +14540,15 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -14726,17 +14726,17 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14911,19 +14911,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -15125,21 +15125,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15341,21 +15341,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15575,19 +15575,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -15836,25 +15836,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -16103,25 +16103,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -16361,19 +16361,19 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16640,21 +16640,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16918,23 +16918,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17225,25 +17225,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17534,25 +17534,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17819,21 +17819,21 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18100,21 +18100,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18405,25 +18405,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18714,25 +18714,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19023,25 +19023,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19332,25 +19332,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19641,25 +19641,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19950,25 +19950,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20259,25 +20259,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20568,25 +20568,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20870,21 +20870,21 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21195,23 +21195,23 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21521,25 +21521,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21878,29 +21878,29 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22239,29 +22239,29 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22576,25 +22576,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22905,23 +22905,23 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23260,29 +23260,29 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23621,29 +23621,29 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23982,29 +23982,29 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24343,29 +24343,29 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24700,27 +24700,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25059,29 +25059,29 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25420,29 +25420,29 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25781,29 +25781,29 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 0b5e6e9da7418..8d98f532908fe 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: @@ -18,16 +18,16 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_last_use_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_last_use_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -68,20 +68,20 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_last_use_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_last_use_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid @@ -108,18 +108,18 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_last_use_and_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -141,16 +141,16 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_last_use_and_nontemporal_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_last_use_and_nontemporal_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 58f33dfed87d6..af48eaf8fcda6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4, !nontemporal !0 @@ -568,20 +568,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -767,16 +767,16 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -1134,19 +1134,19 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1346,18 +1346,18 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index ed90e278d1e86..871c941dd6dca 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4 @@ -382,16 +382,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4 @@ -575,16 +575,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4 @@ -768,16 +768,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4 @@ -929,15 +929,15 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1088,15 +1088,15 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1247,15 +1247,15 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1406,15 +1406,15 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1565,15 +1565,15 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1724,15 +1724,15 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1883,15 +1883,15 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -2042,15 +2042,15 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2201,15 +2201,15 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2404,17 +2404,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2610,17 +2610,17 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2816,17 +2816,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -3066,19 +3066,19 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3318,19 +3318,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3570,19 +3570,19 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3822,19 +3822,19 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4074,19 +4074,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4326,19 +4326,19 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4578,19 +4578,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4830,19 +4830,19 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5082,19 +5082,19 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5334,19 +5334,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5586,19 +5586,19 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5838,19 +5838,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6090,19 +6090,19 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6342,19 +6342,19 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6594,19 +6594,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6890,21 +6890,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7190,21 +7190,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7490,21 +7490,21 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7790,21 +7790,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8090,21 +8090,21 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8390,21 +8390,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8690,21 +8690,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8990,21 +8990,21 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9290,21 +9290,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9590,21 +9590,21 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9890,21 +9890,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10190,21 +10190,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10490,21 +10490,21 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10790,21 +10790,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11090,21 +11090,21 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11290,16 +11290,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4 @@ -11483,16 +11483,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4 @@ -11676,16 +11676,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4 @@ -11869,16 +11869,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -12030,15 +12030,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 @@ -12189,15 +12189,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 @@ -12348,15 +12348,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -12507,15 +12507,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -12666,15 +12666,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12825,15 +12825,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12984,15 +12984,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -13143,15 +13143,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13302,15 +13302,15 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13505,17 +13505,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -13711,17 +13711,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13917,17 +13917,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -14167,19 +14167,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14419,19 +14419,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14671,19 +14671,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14923,19 +14923,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15175,19 +15175,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15427,19 +15427,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15679,19 +15679,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15931,19 +15931,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16183,19 +16183,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16435,19 +16435,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16687,19 +16687,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16939,19 +16939,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17191,19 +17191,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17443,19 +17443,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17695,19 +17695,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17991,21 +17991,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18291,21 +18291,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18591,21 +18591,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18891,21 +18891,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19191,21 +19191,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19491,21 +19491,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19791,21 +19791,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20091,21 +20091,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20391,21 +20391,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20691,21 +20691,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20991,21 +20991,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21291,21 +21291,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21591,21 +21591,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21891,21 +21891,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22191,21 +22191,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 3c6ffdd1a6332..9d70a2437e553 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in unordered, align 4 @@ -382,16 +382,16 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in monotonic, align 4 @@ -592,17 +592,17 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in acquire, align 4 @@ -829,23 +829,23 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -997,15 +997,15 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out unordered, align 4 @@ -1156,15 +1156,15 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1343,20 +1343,20 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1535,20 +1535,20 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1699,15 +1699,15 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1891,17 +1891,17 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2080,20 +2080,20 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2305,22 +2305,22 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2532,22 +2532,22 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2759,18 +2759,18 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -3015,25 +3015,25 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3278,25 +3278,25 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3536,19 +3536,19 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3821,21 +3821,21 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4103,24 +4103,24 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4421,26 +4421,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4741,26 +4741,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5033,21 +5033,21 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5320,21 +5320,21 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5635,26 +5635,26 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5955,26 +5955,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6275,26 +6275,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6595,26 +6595,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6915,26 +6915,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7235,26 +7235,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7555,26 +7555,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7875,26 +7875,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8178,21 +8178,21 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8495,22 +8495,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8824,26 +8824,26 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9178,29 +9178,29 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9535,29 +9535,29 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9864,24 +9864,24 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10184,22 +10184,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10534,29 +10534,29 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10891,29 +10891,29 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11248,29 +11248,29 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11605,29 +11605,29 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11958,27 +11958,27 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12313,29 +12313,29 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12670,29 +12670,29 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13027,29 +13027,29 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13235,16 +13235,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4 @@ -13428,16 +13428,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4 @@ -13648,18 +13648,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4 @@ -13896,24 +13896,24 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -14065,15 +14065,15 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -14224,15 +14224,15 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -14411,20 +14411,20 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") release, align 4 @@ -14603,20 +14603,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -14767,15 +14767,15 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -14955,17 +14955,17 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15144,20 +15144,20 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -15365,22 +15365,22 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15588,22 +15588,22 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15825,19 +15825,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -16092,26 +16092,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -16366,26 +16366,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -16625,19 +16625,19 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16906,21 +16906,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17188,24 +17188,24 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17502,26 +17502,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17818,26 +17818,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18106,21 +18106,21 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18389,21 +18389,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18700,26 +18700,26 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19016,26 +19016,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19332,26 +19332,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19648,26 +19648,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19964,26 +19964,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20280,26 +20280,26 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20596,26 +20596,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20912,26 +20912,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21215,21 +21215,21 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21542,23 +21542,23 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21872,26 +21872,26 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22236,30 +22236,30 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22604,30 +22604,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22944,25 +22944,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23275,23 +23275,23 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23636,30 +23636,30 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24004,30 +24004,30 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24372,30 +24372,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24740,30 +24740,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25104,28 +25104,28 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25470,30 +25470,30 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -25838,30 +25838,30 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -26206,30 +26206,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index e0cf88891e421..43f015c3a2e0f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -7,7 +7,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -145,18 +145,18 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -430,22 +430,22 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -595,17 +595,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -875,20 +875,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_nontemporal_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1030,16 +1030,16 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_volatile_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -1160,16 +1160,16 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 16135f5fc5d6f..f086542b3d1f8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4 @@ -382,16 +382,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4 @@ -575,16 +575,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4 @@ -768,16 +768,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4 @@ -929,15 +929,15 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1088,15 +1088,15 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1247,15 +1247,15 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1406,15 +1406,15 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1565,15 +1565,15 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1724,15 +1724,15 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1883,15 +1883,15 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -2042,15 +2042,15 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2201,15 +2201,15 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2404,17 +2404,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -2610,17 +2610,17 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2816,17 +2816,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -3066,19 +3066,19 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3318,19 +3318,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3570,19 +3570,19 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3822,19 +3822,19 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4074,19 +4074,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4326,19 +4326,19 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4578,19 +4578,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4830,19 +4830,19 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5082,19 +5082,19 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5334,19 +5334,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5586,19 +5586,19 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5838,19 +5838,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6090,19 +6090,19 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6342,19 +6342,19 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6594,19 +6594,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6890,21 +6890,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7190,21 +7190,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7490,21 +7490,21 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7790,21 +7790,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8090,21 +8090,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8390,21 +8390,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8690,21 +8690,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8990,21 +8990,21 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9290,21 +9290,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9590,21 +9590,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9890,21 +9890,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10190,21 +10190,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10490,21 +10490,21 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10790,21 +10790,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11090,21 +11090,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11290,16 +11290,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4 @@ -11483,16 +11483,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4 @@ -11676,16 +11676,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4 @@ -11869,16 +11869,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -12030,15 +12030,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -12189,15 +12189,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -12348,15 +12348,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -12507,15 +12507,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -12666,15 +12666,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12825,15 +12825,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12984,15 +12984,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -13143,15 +13143,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13302,15 +13302,15 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13505,17 +13505,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -13711,17 +13711,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13917,17 +13917,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -14167,19 +14167,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14419,19 +14419,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14671,19 +14671,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14923,19 +14923,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15175,19 +15175,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15427,19 +15427,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15679,19 +15679,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15931,19 +15931,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16183,19 +16183,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16435,19 +16435,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16687,19 +16687,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16939,19 +16939,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17191,19 +17191,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17443,19 +17443,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17695,19 +17695,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17991,21 +17991,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18291,21 +18291,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18591,21 +18591,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18891,21 +18891,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19191,21 +19191,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19491,21 +19491,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19791,21 +19791,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20091,21 +20091,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20391,21 +20391,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20691,21 +20691,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20991,21 +20991,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21291,21 +21291,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21591,21 +21591,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21891,21 +21891,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 18e6812ded962..d8e6ad043e061 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -11,7 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -189,16 +189,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4 @@ -382,16 +382,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4 @@ -587,16 +587,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -811,17 +811,17 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -973,15 +973,15 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1132,15 +1132,15 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1308,16 +1308,16 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1485,16 +1485,16 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1645,15 +1645,15 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1823,16 +1823,16 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2000,16 +2000,16 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2196,17 +2196,17 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2393,17 +2393,17 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2610,17 +2610,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2847,18 +2847,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -3085,18 +3085,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3336,19 +3336,19 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3607,20 +3607,20 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3877,20 +3877,20 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4166,21 +4166,21 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4456,21 +4456,21 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4729,20 +4729,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5001,20 +5001,20 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5290,21 +5290,21 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5580,21 +5580,21 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5870,21 +5870,21 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6160,21 +6160,21 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6458,21 +6458,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6770,21 +6770,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7087,22 +7087,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7419,22 +7419,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7751,22 +7751,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8066,21 +8066,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8378,21 +8378,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8709,22 +8709,22 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9041,22 +9041,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9373,22 +9373,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9705,22 +9705,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10035,22 +10035,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10367,22 +10367,22 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10699,22 +10699,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11031,22 +11031,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11232,16 +11232,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4 @@ -11425,16 +11425,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11626,16 +11626,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4 @@ -11839,16 +11839,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12000,15 +12000,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4 @@ -12159,15 +12159,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12328,15 +12328,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -12497,15 +12497,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12656,15 +12656,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12825,15 +12825,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12994,15 +12994,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release @@ -13173,15 +13173,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13352,15 +13352,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13563,17 +13563,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13789,17 +13789,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -14015,17 +14015,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -14265,19 +14265,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14527,19 +14527,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14789,19 +14789,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15061,19 +15061,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15333,19 +15333,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15595,19 +15595,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15857,19 +15857,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16129,19 +16129,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16401,19 +16401,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16673,19 +16673,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16945,19 +16945,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17217,19 +17217,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17489,19 +17489,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17761,19 +17761,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18033,19 +18033,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18329,21 +18329,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18637,21 +18637,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18947,21 +18947,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19267,21 +19267,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19587,21 +19587,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19897,21 +19897,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20205,21 +20205,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20525,21 +20525,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20845,21 +20845,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21165,21 +21165,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21485,21 +21485,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21803,21 +21803,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22123,21 +22123,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22443,21 +22443,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22763,21 +22763,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 51859c112bf9f..184e15406bfbc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -192,16 +192,16 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4 @@ -387,16 +387,16 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4 @@ -598,17 +598,17 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4 @@ -829,23 +829,23 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -1004,15 +1004,15 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1170,15 +1170,15 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1361,19 +1361,19 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1556,19 +1556,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1724,15 +1724,15 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1917,17 +1917,17 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release @@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2768,18 +2768,18 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -3009,24 +3009,24 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3256,24 +3256,24 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3494,19 +3494,19 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3756,21 +3756,21 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4016,23 +4016,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4307,25 +4307,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4600,25 +4600,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4868,21 +4868,21 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5132,21 +5132,21 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5421,25 +5421,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5714,25 +5714,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6007,25 +6007,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6300,25 +6300,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6593,25 +6593,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6886,25 +6886,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7179,25 +7179,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7472,25 +7472,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7741,21 +7741,21 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8024,22 +8024,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8317,25 +8317,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8633,28 +8633,28 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8952,28 +8952,28 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9246,24 +9246,24 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9532,22 +9532,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9845,28 +9845,28 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10164,28 +10164,28 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10483,28 +10483,28 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10802,28 +10802,28 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11117,26 +11117,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11434,28 +11434,28 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11753,28 +11753,28 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12072,28 +12072,28 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12281,16 +12281,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4 @@ -12476,16 +12476,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4 @@ -12687,17 +12687,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4 @@ -12918,23 +12918,23 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 @@ -13093,15 +13093,15 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -13259,15 +13259,15 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -13450,19 +13450,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -13645,19 +13645,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -13813,15 +13813,15 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -14006,17 +14006,17 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14197,19 +14197,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -14419,21 +14419,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14643,21 +14643,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14857,18 +14857,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -15098,24 +15098,24 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15345,24 +15345,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15583,19 +15583,19 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15845,21 +15845,21 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16105,23 +16105,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16396,25 +16396,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16689,25 +16689,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16957,21 +16957,21 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17221,21 +17221,21 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17510,25 +17510,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17803,25 +17803,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18096,25 +18096,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18389,25 +18389,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18682,25 +18682,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18975,25 +18975,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19268,25 +19268,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19561,25 +19561,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19830,21 +19830,21 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20113,22 +20113,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20426,28 +20426,28 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20745,28 +20745,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21039,24 +21039,24 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21325,22 +21325,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21638,28 +21638,28 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21957,28 +21957,28 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22276,28 +22276,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22595,28 +22595,28 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22910,26 +22910,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23227,28 +23227,28 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23546,28 +23546,28 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23865,28 +23865,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 2c8fa9da74862..ed2d62356f8f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: @@ -16,17 +16,17 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_last_use_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_last_use_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -51,20 +51,20 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_last_use_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_last_use_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid @@ -87,18 +87,18 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_last_use_and_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -123,20 +123,20 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_last_use_and_nontemporal_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_last_use_and_nontemporal_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index e73300dbc5ac6..0ad64f5599fe7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -191,17 +191,17 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0 @@ -462,20 +462,20 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -662,17 +662,17 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -907,19 +907,19 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1111,18 +1111,18 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_nontemporal_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 2633bba70ddd3..6a5a6e01c741b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -192,16 +192,16 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4 @@ -387,16 +387,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4 @@ -582,16 +582,16 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4 @@ -777,16 +777,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4 @@ -945,15 +945,15 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1111,15 +1111,15 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1277,15 +1277,15 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1443,15 +1443,15 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1607,15 +1607,15 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1771,15 +1771,15 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1935,15 +1935,15 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -2099,15 +2099,15 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2263,15 +2263,15 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2455,17 +2455,17 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2650,17 +2650,17 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2845,17 +2845,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -3076,19 +3076,19 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3309,19 +3309,19 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3542,19 +3542,19 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,19 +3775,19 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4008,19 +4008,19 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4241,19 +4241,19 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4474,19 +4474,19 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4707,19 +4707,19 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4940,19 +4940,19 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5173,19 +5173,19 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5406,19 +5406,19 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5639,19 +5639,19 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5872,19 +5872,19 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6105,19 +6105,19 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6338,19 +6338,19 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6601,21 +6601,21 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6868,21 +6868,21 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7135,21 +7135,21 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7402,21 +7402,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7669,21 +7669,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7936,21 +7936,21 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8203,21 +8203,21 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8470,21 +8470,21 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8737,21 +8737,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9004,21 +9004,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9271,21 +9271,21 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9538,21 +9538,21 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9805,21 +9805,21 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10072,21 +10072,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10339,21 +10339,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10541,16 +10541,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4 @@ -10736,16 +10736,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -10931,16 +10931,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4 @@ -11126,16 +11126,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -11294,15 +11294,15 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -11460,15 +11460,15 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -11626,15 +11626,15 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11792,15 +11792,15 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11956,15 +11956,15 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12120,15 +12120,15 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12284,15 +12284,15 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -12448,15 +12448,15 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12612,15 +12612,15 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12804,17 +12804,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12999,17 +12999,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13194,17 +13194,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13425,19 +13425,19 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13658,19 +13658,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13891,19 +13891,19 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14124,19 +14124,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14357,19 +14357,19 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14590,19 +14590,19 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,19 +14823,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15056,19 +15056,19 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15289,19 +15289,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15522,19 +15522,19 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15755,19 +15755,19 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15988,19 +15988,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16221,19 +16221,19 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16454,19 +16454,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16687,19 +16687,19 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16950,21 +16950,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17217,21 +17217,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17484,21 +17484,21 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17751,21 +17751,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18018,21 +18018,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18285,21 +18285,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18552,21 +18552,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18819,21 +18819,21 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19086,21 +19086,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19353,21 +19353,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19620,21 +19620,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19887,21 +19887,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20154,21 +20154,21 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20421,21 +20421,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20688,21 +20688,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index c194b49f25255..7ddd515830e11 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -192,16 +192,16 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in unordered, align 4 @@ -387,16 +387,16 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in monotonic, align 4 @@ -600,17 +600,17 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in acquire, align 4 @@ -833,23 +833,23 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -1008,15 +1008,15 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1174,15 +1174,15 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1369,20 +1369,20 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1569,20 +1569,20 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1738,15 +1738,15 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1933,17 +1933,17 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2128,20 +2128,20 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2357,22 +2357,22 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2588,22 +2588,22 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2805,18 +2805,18 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -3052,25 +3052,25 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3306,25 +3306,25 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3545,19 +3545,19 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3809,21 +3809,21 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4073,24 +4073,24 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4371,26 +4371,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4671,26 +4671,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4942,21 +4942,21 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5208,21 +5208,21 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5503,26 +5503,26 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5803,26 +5803,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6103,26 +6103,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6403,26 +6403,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6673,21 +6673,21 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6958,22 +6958,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7277,29 +7277,29 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7603,29 +7603,29 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7900,24 +7900,24 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8188,22 +8188,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8507,29 +8507,29 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8833,29 +8833,29 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9159,29 +9159,29 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9485,29 +9485,29 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9807,27 +9807,27 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10131,29 +10131,29 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10457,29 +10457,29 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10783,29 +10783,29 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10993,16 +10993,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4 @@ -11188,16 +11188,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4 @@ -11401,17 +11401,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4 @@ -11634,23 +11634,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11809,15 +11809,15 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11975,15 +11975,15 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -12170,20 +12170,20 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -12370,20 +12370,20 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -12539,15 +12539,15 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -12734,17 +12734,17 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12929,20 +12929,20 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -13158,22 +13158,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -13389,22 +13389,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -13606,18 +13606,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -13853,25 +13853,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -14107,25 +14107,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -14346,19 +14346,19 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14610,21 +14610,21 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14874,24 +14874,24 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15172,26 +15172,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15472,26 +15472,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15743,21 +15743,21 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16009,21 +16009,21 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16304,26 +16304,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16604,26 +16604,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16904,26 +16904,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17204,26 +17204,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17504,26 +17504,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17804,26 +17804,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18104,26 +18104,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18404,26 +18404,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18674,21 +18674,21 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18959,22 +18959,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19256,26 +19256,26 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19579,29 +19579,29 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19905,29 +19905,29 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20202,24 +20202,24 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20490,22 +20490,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20809,29 +20809,29 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21135,29 +21135,29 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21461,29 +21461,29 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21787,29 +21787,29 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22109,27 +22109,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22433,29 +22433,29 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22759,29 +22759,29 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -23085,29 +23085,29 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 10d9ee0617a0e..0d18963cbfb68 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -8,7 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: @@ -148,18 +148,18 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -360,22 +360,22 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-CU-NEXT: s_wait_samplecnt 0x0 -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -533,18 +533,18 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -738,20 +738,20 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-CU-NEXT: s_wait_storecnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -898,16 +898,16 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -1040,16 +1040,16 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index f64b283edf43f..1aa8305b1a837 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -192,16 +192,16 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4 @@ -387,16 +387,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4 @@ -582,16 +582,16 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4 @@ -777,16 +777,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4 @@ -945,15 +945,15 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1111,15 +1111,15 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1277,15 +1277,15 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1443,15 +1443,15 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1607,15 +1607,15 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1771,15 +1771,15 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1935,15 +1935,15 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -2099,15 +2099,15 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2263,15 +2263,15 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2455,17 +2455,17 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2650,17 +2650,17 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2845,17 +2845,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -3076,19 +3076,19 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3309,19 +3309,19 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3542,19 +3542,19 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,19 +3775,19 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4008,19 +4008,19 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4241,19 +4241,19 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4474,19 +4474,19 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4707,19 +4707,19 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4940,19 +4940,19 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5173,19 +5173,19 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5406,19 +5406,19 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5639,19 +5639,19 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5872,19 +5872,19 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6105,19 +6105,19 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6338,19 +6338,19 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6601,21 +6601,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6868,21 +6868,21 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7135,21 +7135,21 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7402,21 +7402,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7669,21 +7669,21 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7936,21 +7936,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8203,21 +8203,21 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8470,21 +8470,21 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8737,21 +8737,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9004,21 +9004,21 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9271,21 +9271,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9538,21 +9538,21 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9805,21 +9805,21 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10072,21 +10072,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10339,21 +10339,21 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10541,16 +10541,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4 @@ -10736,16 +10736,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -10931,16 +10931,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4 @@ -11126,16 +11126,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -11294,15 +11294,15 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -11460,15 +11460,15 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11626,15 +11626,15 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11792,15 +11792,15 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11956,15 +11956,15 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12120,15 +12120,15 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12284,15 +12284,15 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -12448,15 +12448,15 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12612,15 +12612,15 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12804,17 +12804,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12999,17 +12999,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13194,17 +13194,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13425,19 +13425,19 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13658,19 +13658,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13891,19 +13891,19 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14124,19 +14124,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14357,19 +14357,19 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14590,19 +14590,19 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,19 +14823,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15056,19 +15056,19 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15289,19 +15289,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15522,19 +15522,19 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15755,19 +15755,19 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15988,19 +15988,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16221,19 +16221,19 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16454,19 +16454,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16687,19 +16687,19 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16950,21 +16950,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17217,21 +17217,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17484,21 +17484,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17751,21 +17751,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18018,21 +18018,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18285,21 +18285,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18552,21 +18552,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18819,21 +18819,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19086,21 +19086,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19353,21 +19353,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19620,21 +19620,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19887,21 +19887,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20154,21 +20154,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20421,21 +20421,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20688,21 +20688,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index c1879c8eb11af..3eab16e6b9713 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -192,16 +192,16 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4 @@ -387,16 +387,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4 @@ -587,16 +587,16 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -799,17 +799,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -968,15 +968,15 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1134,15 +1134,15 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1318,16 +1318,16 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1503,16 +1503,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1668,15 +1668,15 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1842,15 +1842,15 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2024,16 +2024,16 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2217,16 +2217,16 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2410,16 +2410,16 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2608,17 +2608,17 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2828,18 +2828,18 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -3049,18 +3049,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3281,19 +3281,19 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3524,19 +3524,19 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3775,20 +3775,20 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4037,20 +4037,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4299,20 +4299,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4543,19 +4543,19 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4786,19 +4786,19 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5047,20 +5047,20 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5309,20 +5309,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5571,20 +5571,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5833,20 +5833,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6095,20 +6095,20 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6357,20 +6357,20 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6619,20 +6619,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6881,20 +6881,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7145,21 +7145,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7417,21 +7417,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7702,22 +7702,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7995,22 +7995,22 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8288,22 +8288,22 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8563,21 +8563,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8835,21 +8835,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9127,22 +9127,22 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9420,22 +9420,22 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9713,22 +9713,22 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10006,22 +10006,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10297,22 +10297,22 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10590,22 +10590,22 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10883,22 +10883,22 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11176,22 +11176,22 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11379,16 +11379,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4 @@ -11574,16 +11574,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11774,16 +11774,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4 @@ -11982,16 +11982,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -12150,15 +12150,15 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -12316,15 +12316,15 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -12492,15 +12492,15 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -12668,15 +12668,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12832,15 +12832,15 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -13006,15 +13006,15 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13180,15 +13180,15 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -13364,15 +13364,15 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13548,15 +13548,15 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13745,17 +13745,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13957,17 +13957,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -14169,17 +14169,17 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -14400,19 +14400,19 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14643,19 +14643,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14886,19 +14886,19 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15139,19 +15139,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15392,19 +15392,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15635,19 +15635,19 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15878,19 +15878,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16131,19 +16131,19 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16384,19 +16384,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16637,19 +16637,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16890,19 +16890,19 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17143,19 +17143,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17396,19 +17396,19 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17649,19 +17649,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17902,19 +17902,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18165,21 +18165,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18437,21 +18437,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18714,21 +18714,21 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18998,21 +18998,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19282,21 +19282,21 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19556,21 +19556,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19828,21 +19828,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20112,21 +20112,21 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20396,21 +20396,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20680,21 +20680,21 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20964,21 +20964,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21246,21 +21246,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21530,21 +21530,21 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21814,21 +21814,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22098,21 +22098,21 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN -; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index fe703f5e8c90f..102616b9a2065 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4 @@ -550,17 +550,17 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4 @@ -756,18 +756,18 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -910,15 +910,15 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4 @@ -1060,15 +1060,15 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 @@ -1228,16 +1228,16 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1397,16 +1397,16 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 @@ -1548,15 +1548,15 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") monotonic @@ -1714,16 +1714,16 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -1883,16 +1883,16 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -2068,17 +2068,17 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2254,17 +2254,17 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2453,18 +2453,18 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -2672,19 +2672,19 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2892,19 +2892,19 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -3073,17 +3073,17 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3268,18 +3268,18 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3466,18 +3466,18 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3680,19 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3895,19 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4092,18 +4092,18 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4288,18 +4288,18 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4502,19 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4717,19 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4932,19 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5147,19 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5362,19 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5577,19 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5792,19 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6007,19 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6230,20 +6230,20 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6461,20 +6461,20 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6705,21 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6955,21 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7205,21 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7437,20 +7437,20 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7668,20 +7668,20 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7917,21 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8167,21 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8417,21 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8667,21 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8917,21 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9167,21 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9417,21 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9667,21 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9856,17 +9856,17 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") unordered, align 4 @@ -10039,17 +10039,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") monotonic, align 4 @@ -10222,17 +10222,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") acquire, align 4 @@ -10405,17 +10405,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") seq_cst, align 4 @@ -10558,15 +10558,15 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") unordered, align 4 @@ -10708,15 +10708,15 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") monotonic, align 4 @@ -10858,15 +10858,15 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") release, align 4 @@ -11008,15 +11008,15 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") seq_cst, align 4 @@ -11158,15 +11158,15 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") monotonic @@ -11308,15 +11308,15 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -11458,15 +11458,15 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") release @@ -11608,15 +11608,15 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11758,15 +11758,15 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11950,18 +11950,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -12146,18 +12146,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -12342,18 +12342,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -12522,17 +12522,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12701,17 +12701,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12880,17 +12880,17 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13059,17 +13059,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13238,17 +13238,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13417,17 +13417,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13596,17 +13596,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13775,17 +13775,17 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13954,17 +13954,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14133,17 +14133,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14312,17 +14312,17 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14491,17 +14491,17 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14670,17 +14670,17 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14849,17 +14849,17 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15028,17 +15028,17 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15249,20 +15249,20 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15475,20 +15475,20 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15701,20 +15701,20 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15927,20 +15927,20 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16153,20 +16153,20 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16379,20 +16379,20 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16605,20 +16605,20 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16831,20 +16831,20 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17057,20 +17057,20 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17283,20 +17283,20 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17509,20 +17509,20 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17735,20 +17735,20 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17961,20 +17961,20 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18187,20 +18187,20 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18413,20 +18413,20 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 689932469d78d..c6f7ce51f5ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -195,17 +195,17 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_nontemporal_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ds_load_b32 v1, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0 @@ -442,21 +442,21 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_nontemporal_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-CU-NEXT: s_mov_b32 s2, 2 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 -; GFX1250-CU-NEXT: ds_load_b32 v1, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -627,17 +627,17 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_nontemporal_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -844,21 +844,21 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_nontemporal_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-CU-NEXT: s_mov_b32 s1, 2 -; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-NEXT: s_mov_b32 s1, 2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1049,17 +1049,17 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_nontemporal_volatile_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ds_load_b32 v1, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index 97c80ece2b053..1800acbbf605b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") monotonic, align 4 @@ -545,17 +545,17 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") acquire, align 4 @@ -728,17 +728,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") seq_cst, align 4 @@ -881,15 +881,15 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") unordered, align 4 @@ -1031,15 +1031,15 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") monotonic, align 4 @@ -1181,15 +1181,15 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") release, align 4 @@ -1331,15 +1331,15 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") seq_cst, align 4 @@ -1481,15 +1481,15 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") monotonic @@ -1631,15 +1631,15 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -1781,15 +1781,15 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") release @@ -1931,15 +1931,15 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -2081,15 +2081,15 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2273,18 +2273,18 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -2469,18 +2469,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -2665,18 +2665,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2845,17 +2845,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3024,17 +3024,17 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3203,17 +3203,17 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3382,17 +3382,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3561,17 +3561,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3740,17 +3740,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3919,17 +3919,17 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4098,17 +4098,17 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4277,17 +4277,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4456,17 +4456,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4635,17 +4635,17 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4814,17 +4814,17 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4993,17 +4993,17 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5172,17 +5172,17 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5351,17 +5351,17 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5572,20 +5572,20 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5798,20 +5798,20 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6024,20 +6024,20 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6250,20 +6250,20 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6476,20 +6476,20 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6702,20 +6702,20 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6928,20 +6928,20 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7154,20 +7154,20 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7380,20 +7380,20 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7606,20 +7606,20 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7832,20 +7832,20 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8058,20 +8058,20 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8284,20 +8284,20 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8510,20 +8510,20 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8736,20 +8736,20 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8924,17 +8924,17 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") unordered, align 4 @@ -9107,17 +9107,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -9290,17 +9290,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") acquire, align 4 @@ -9473,17 +9473,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -9626,15 +9626,15 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") unordered, align 4 @@ -9776,15 +9776,15 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -9926,15 +9926,15 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") release, align 4 @@ -10076,15 +10076,15 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -10226,15 +10226,15 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -10376,15 +10376,15 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -10526,15 +10526,15 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") release @@ -10676,15 +10676,15 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10826,15 +10826,15 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -11018,18 +11018,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -11214,18 +11214,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -11410,18 +11410,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -11590,17 +11590,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11769,17 +11769,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11948,17 +11948,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12127,17 +12127,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12306,17 +12306,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12485,17 +12485,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12664,17 +12664,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12843,17 +12843,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13022,17 +13022,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13201,17 +13201,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13380,17 +13380,17 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13559,17 +13559,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13738,17 +13738,17 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13917,17 +13917,17 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14096,17 +14096,17 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14317,20 +14317,20 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14543,20 +14543,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14769,20 +14769,20 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14995,20 +14995,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15221,20 +15221,20 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15447,20 +15447,20 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15673,20 +15673,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15899,20 +15899,20 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16125,20 +16125,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16351,20 +16351,20 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16577,20 +16577,20 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16803,20 +16803,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17029,20 +17029,20 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17255,20 +17255,20 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17481,20 +17481,20 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index fdf69a5998652..1356fe4854170 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in monotonic, align 4 @@ -550,17 +550,17 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in acquire, align 4 @@ -756,18 +756,18 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -910,15 +910,15 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out unordered, align 4 @@ -1060,15 +1060,15 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out monotonic, align 4 @@ -1228,16 +1228,16 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1397,16 +1397,16 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1548,15 +1548,15 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in monotonic @@ -1714,16 +1714,16 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -1883,16 +1883,16 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release @@ -2068,17 +2068,17 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2254,17 +2254,17 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2453,18 +2453,18 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -2672,19 +2672,19 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2892,19 +2892,19 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -3073,17 +3073,17 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3268,18 +3268,18 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3466,18 +3466,18 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3680,19 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3895,19 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4092,18 +4092,18 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4288,18 +4288,18 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4502,19 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4717,19 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4932,19 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5147,19 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5362,19 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5577,19 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5792,19 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6007,19 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6230,20 +6230,20 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6461,20 +6461,20 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6705,21 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6955,21 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7205,21 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7437,20 +7437,20 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7668,20 +7668,20 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7917,21 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8167,21 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8417,21 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8667,21 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8917,21 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9167,21 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9417,21 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9667,21 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9856,17 +9856,17 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") unordered, align 4 @@ -10039,17 +10039,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") monotonic, align 4 @@ -10222,17 +10222,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") acquire, align 4 @@ -10405,17 +10405,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") seq_cst, align 4 @@ -10558,15 +10558,15 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") unordered, align 4 @@ -10708,15 +10708,15 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") monotonic, align 4 @@ -10858,15 +10858,15 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") release, align 4 @@ -11008,15 +11008,15 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") seq_cst, align 4 @@ -11158,15 +11158,15 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") monotonic @@ -11308,15 +11308,15 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -11458,15 +11458,15 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") release @@ -11608,15 +11608,15 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11758,15 +11758,15 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11950,18 +11950,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -12146,18 +12146,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -12342,18 +12342,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -12522,17 +12522,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12701,17 +12701,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12880,17 +12880,17 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13059,17 +13059,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13238,17 +13238,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13417,17 +13417,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13596,17 +13596,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13775,17 +13775,17 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13954,17 +13954,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14133,17 +14133,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14312,17 +14312,17 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14491,17 +14491,17 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14670,17 +14670,17 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14849,17 +14849,17 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15028,17 +15028,17 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15249,20 +15249,20 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15475,20 +15475,20 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15701,20 +15701,20 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15927,20 +15927,20 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16153,20 +16153,20 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16379,20 +16379,20 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16605,20 +16605,20 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16831,20 +16831,20 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17057,20 +17057,20 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17283,20 +17283,20 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17509,20 +17509,20 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17735,20 +17735,20 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17961,20 +17961,20 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18187,20 +18187,20 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18413,20 +18413,20 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 88cba0bddf5d7..75e28f9008e28 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -8,7 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: @@ -143,17 +143,17 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_load_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: ds_load_b32 v1, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4 @@ -322,21 +322,21 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_load_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-CU-NEXT: s_mov_b32 s2, 2 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 -; GFX1250-CU-NEXT: ds_load_b32 v1, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -459,17 +459,17 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_store_0: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -612,21 +612,21 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_store_1: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX1250-CU-NEXT: s_wait_xcnt 0x0 -; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-CU-NEXT: s_mov_b32 s1, 2 -; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-NEXT: s_mov_b32 s1, 2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -756,17 +756,17 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -883,16 +883,16 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_volatile_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b8ad75049aff8..7e345ed6e2716 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") monotonic, align 4 @@ -545,17 +545,17 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") acquire, align 4 @@ -728,17 +728,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") seq_cst, align 4 @@ -881,15 +881,15 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") unordered, align 4 @@ -1031,15 +1031,15 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 @@ -1181,15 +1181,15 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") release, align 4 @@ -1331,15 +1331,15 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") seq_cst, align 4 @@ -1481,15 +1481,15 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") monotonic @@ -1631,15 +1631,15 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -1781,15 +1781,15 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") release @@ -1931,15 +1931,15 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -2081,15 +2081,15 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2273,18 +2273,18 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -2469,18 +2469,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -2665,18 +2665,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2845,17 +2845,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3024,17 +3024,17 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3203,17 +3203,17 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3382,17 +3382,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3561,17 +3561,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3740,17 +3740,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3919,17 +3919,17 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4098,17 +4098,17 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4277,17 +4277,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4456,17 +4456,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4635,17 +4635,17 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4814,17 +4814,17 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4993,17 +4993,17 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5172,17 +5172,17 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5351,17 +5351,17 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5572,20 +5572,20 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5798,20 +5798,20 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6024,20 +6024,20 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6250,20 +6250,20 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6476,20 +6476,20 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6702,20 +6702,20 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6928,20 +6928,20 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7154,20 +7154,20 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7380,20 +7380,20 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7606,20 +7606,20 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7832,20 +7832,20 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8058,20 +8058,20 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8284,20 +8284,20 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8510,20 +8510,20 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8736,20 +8736,20 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8924,17 +8924,17 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") unordered, align 4 @@ -9107,17 +9107,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -9290,17 +9290,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") acquire, align 4 @@ -9473,17 +9473,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -9626,15 +9626,15 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") unordered, align 4 @@ -9776,15 +9776,15 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -9926,15 +9926,15 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") release, align 4 @@ -10076,15 +10076,15 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -10226,15 +10226,15 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -10376,15 +10376,15 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -10526,15 +10526,15 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") release @@ -10676,15 +10676,15 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10826,15 +10826,15 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -11018,18 +11018,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -11214,18 +11214,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -11410,18 +11410,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -11590,17 +11590,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11769,17 +11769,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11948,17 +11948,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12127,17 +12127,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12306,17 +12306,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12485,17 +12485,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12664,17 +12664,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12843,17 +12843,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13022,17 +13022,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13201,17 +13201,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13380,17 +13380,17 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13559,17 +13559,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13738,17 +13738,17 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13917,17 +13917,17 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14096,17 +14096,17 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14317,20 +14317,20 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14543,20 +14543,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14769,20 +14769,20 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14995,20 +14995,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15221,20 +15221,20 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15447,20 +15447,20 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15673,20 +15673,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15899,20 +15899,20 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16125,20 +16125,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16351,20 +16351,20 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16577,20 +16577,20 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16803,20 +16803,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17029,20 +17029,20 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17255,20 +17255,20 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17481,20 +17481,20 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 94f5aab1eb67d..6aaf9d323b1fd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4 @@ -550,17 +550,17 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -756,18 +756,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4 @@ -910,15 +910,15 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4 @@ -1060,15 +1060,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4 @@ -1228,16 +1228,16 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 @@ -1397,16 +1397,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4 @@ -1548,15 +1548,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic @@ -1714,16 +1714,16 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire @@ -1883,16 +1883,16 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release @@ -2068,17 +2068,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2254,17 +2254,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -2453,18 +2453,18 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire @@ -2672,19 +2672,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2892,19 +2892,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -3073,17 +3073,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3268,18 +3268,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3466,18 +3466,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3680,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3895,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4092,18 +4092,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4288,18 +4288,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4502,19 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4717,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4932,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5147,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5362,19 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5577,19 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5792,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6007,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6230,20 +6230,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6461,20 +6461,20 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6705,21 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6955,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7205,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7437,20 +7437,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7668,20 +7668,20 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7917,21 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8167,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8417,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8667,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8917,21 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9167,21 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9417,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9667,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9856,17 +9856,17 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4 @@ -10039,17 +10039,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -10222,17 +10222,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4 @@ -10405,17 +10405,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -10558,15 +10558,15 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4 @@ -10708,15 +10708,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -10858,15 +10858,15 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4 @@ -11008,15 +11008,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -11158,15 +11158,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -11308,15 +11308,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -11458,15 +11458,15 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release @@ -11608,15 +11608,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -11758,15 +11758,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -11950,18 +11950,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12146,18 +12146,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12342,18 +12342,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12522,17 +12522,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12701,17 +12701,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12880,17 +12880,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13059,17 +13059,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13238,17 +13238,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13417,17 +13417,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13596,17 +13596,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13775,17 +13775,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13954,17 +13954,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14133,17 +14133,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14312,17 +14312,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14491,17 +14491,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14670,17 +14670,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14849,17 +14849,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15028,17 +15028,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15249,20 +15249,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15475,20 +15475,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15701,20 +15701,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15927,20 +15927,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16153,20 +16153,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16379,20 +16379,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16605,20 +16605,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16831,20 +16831,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17057,20 +17057,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17283,20 +17283,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17509,20 +17509,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17735,20 +17735,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17961,20 +17961,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18187,20 +18187,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18413,20 +18413,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4