Skip to content

Conversation

@vangthao95
Copy link
Contributor

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Oct 31, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (vangthao95)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165939.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+2)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll (+388)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+7-7)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 103cdec8233a0..fb768c9fd46a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
 
   addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
 
+  addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
+
   addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
       .Uni(S64, {{Sgpr64}, {}});
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll
new file mode 100644
index 0000000000000..6cdcfe354c69b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll
@@ -0,0 +1,388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+@global_var = addrspace(1) global i32 0, align 4
+
+define amdgpu_kernel void @fence_release(ptr addrspace(1) %ptr, i32 %val1, i32 %val2) {
+; GFX9-LABEL: fence_release:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_release:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_release:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v2, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val1, ptr addrspace(1) %ptr
+  fence release
+  store volatile i32 %val2, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_acquire(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: fence_acquire:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_acquire:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_acquire:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  fence acquire
+  %val = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %val, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_acq_rel(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_acq_rel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_acq_rel:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_acq_rel:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence acq_rel
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_seq_cst(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_seq_cst:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_seq_cst:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_seq_cst:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence seq_cst
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_workgroup_release(ptr addrspace(3) %ptr, i32 %val1, i32 %val2) {
+; GFX9-LABEL: fence_workgroup_release:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_workgroup_release:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    ds_store_b32 v1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v2, v3, s[4:5] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_workgroup_release:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-NEXT:    ds_store_b32 v1, v0
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_store_b32 v2, v3, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val1, ptr addrspace(3) %ptr
+  fence syncscope("workgroup") release
+  store volatile i32 %val2, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_agent_seq_cst(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_agent_seq_cst:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_agent_seq_cst:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_agent_seq_cst:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence syncscope("agent") seq_cst
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index e86f7473363f7..37b5422be7e2f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
 
 ; Note: we use MIR test checks + stop after legalizer to prevent
 ; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9f6fe81..61a61376d7ddd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
 
 declare void @readsMem(ptr) #0
 declare void @writesMem(ptr) #1

@llvmbot
Copy link
Member

llvmbot commented Oct 31, 2025

@llvm/pr-subscribers-llvm-globalisel

Author: None (vangthao95)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165939.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+2)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll (+388)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll (+7-7)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 103cdec8233a0..fb768c9fd46a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
 
   addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
 
+  addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
+
   addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
       .Uni(S64, {{Sgpr64}, {}});
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll
new file mode 100644
index 0000000000000..6cdcfe354c69b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fence.ll
@@ -0,0 +1,388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+@global_var = addrspace(1) global i32 0, align 4
+
+define amdgpu_kernel void @fence_release(ptr addrspace(1) %ptr, i32 %val1, i32 %val2) {
+; GFX9-LABEL: fence_release:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_release:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b32 v0, v2, s[4:5] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_release:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v2, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val1, ptr addrspace(1) %ptr
+  fence release
+  store volatile i32 %val2, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_acquire(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: fence_acquire:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_acquire:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_acquire:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  fence acquire
+  %val = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %val, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_acq_rel(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_acq_rel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_acq_rel:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_acq_rel:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence acq_rel
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_seq_cst(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_seq_cst:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_seq_cst:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_seq_cst:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence seq_cst
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_workgroup_release(ptr addrspace(3) %ptr, i32 %val1, i32 %val2) {
+; GFX9-LABEL: fence_workgroup_release:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_workgroup_release:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_getpc_b64 s[4:5]
+; GFX11-NEXT:    s_add_u32 s4, s4, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s5, s5, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    ds_store_b32 v1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v2, v3, s[4:5] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_workgroup_release:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_add_co_u32 s4, s4, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s5, s5, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    v_mov_b32_e32 v3, s2
+; GFX12-NEXT:    ds_store_b32 v1, v0
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_store_b32 v2, v3, s[4:5] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val1, ptr addrspace(3) %ptr
+  fence syncscope("workgroup") release
+  store volatile i32 %val2, ptr addrspace(1) @global_var
+  ret void
+}
+
+define amdgpu_kernel void @fence_agent_seq_cst(ptr addrspace(1) %ptr, i32 %val) {
+; GFX9-LABEL: fence_agent_seq_cst:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fence_agent_seq_cst:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, global_var@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, global_var@gotpcrel32@hi+12
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: fence_agent_seq_cst:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, global_var@gotpcrel32@lo+8
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, global_var@gotpcrel32@hi+16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_endpgm
+  store i32 %val, ptr addrspace(1) %ptr
+  fence syncscope("agent") seq_cst
+  %load = load i32, ptr addrspace(1) %ptr
+  store volatile i32 %load, ptr addrspace(1) @global_var
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index e86f7473363f7..37b5422be7e2f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
 
 ; Note: we use MIR test checks + stop after legalizer to prevent
 ; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9f6fe81..61a61376d7ddd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
 
 declare void @readsMem(ptr) #0
 declare void @writesMem(ptr) #1

@vangthao95 vangthao95 merged commit e876540 into llvm:main Nov 3, 2025
10 checks passed
@vangthao95 vangthao95 deleted the vthao-globalisel-fence branch November 3, 2025 17:36
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants