From f5d7761e7a17a7f3fdfed0b21b78fcc17703ff13 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 15 Aug 2025 13:10:32 -0400 Subject: [PATCH] [AMDGPU] Add support for store to constant address space Since we don't treat stores to the constant address space as IR verifier errors, we need to support their lowering. This PR supports that by treating such stores as no-ops: in the combiner, the store node is simply replaced with its chain. Fixes SWDEV-499366. --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 +- .../CodeGen/AMDGPU/store-to-constant-error.ll | 10 - llvm/test/CodeGen/AMDGPU/store-to-constant.ll | 186 ++++++++++++++++++ 3 files changed, 189 insertions(+), 11 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll create mode 100644 llvm/test/CodeGen/AMDGPU/store-to-constant.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 511fc6967da31..402f5765fba47 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -450,7 +450,9 @@ def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant, def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant, AddrSpaces.Constant32Bit ]>; -def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>; +def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global, + AddrSpaces.Constant, + AddrSpaces.Constant32Bit ]>; def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global, diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll deleted file mode 100644 index 0bfc45c84b0c4..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG %s -; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL %s - -; SDAG: LLVM ERROR: Cannot select: {{[a-z0-9]+}}: ch = store<(store (s32) into %ir.ptr.load, addrspace 4)> -; GISEL: LLVM ERROR: cannot select: G_STORE %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr(p4) :: (store (s32) into %ir.ptr.load, addrspace 4) (in function: store_to_constant_i32) -define amdgpu_kernel void @store_to_constant_i32(ptr addrspace(4) %ptr) { -bb: - store i32 1, ptr addrspace(4) %ptr, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll new file mode 100644 index 0000000000000..64d5d01454a37 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +; FIXME: We need to test AS6 but the AS6 variants of the following tests fail because of illegal VGPR to SGPR copy. +; FIXME: We also want to test memset, memcpy, and memmove, but it needs to fix the SelectionDAG store merging issue (#90714). 
+ +define amdgpu_kernel void @store_as4_i8(ptr addrspace(4) %p, i8 %v) { +; CHECK-LABEL: store_as4_i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_byte v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store i8 %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_i16(ptr addrspace(4) %p, i16 %v) { +; CHECK-LABEL: store_as4_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_short v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store i16 %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_i32(ptr addrspace(4) %p, i32 %v) { +; CHECK-LABEL: store_as4_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store i32 %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_i64(ptr addrspace(4) %p, i64 %v) { +; CHECK-LABEL: store_as4_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; CHECK-NEXT: s_endpgm + store i64 %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_float(ptr addrspace(4) %p, float %v) { +; CHECK-LABEL: store_as4_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 
s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store float %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_double(ptr addrspace(4) %p, double %v) { +; CHECK-LABEL: store_as4_double: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] +; CHECK-NEXT: s_endpgm + store double %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_half(ptr addrspace(4) %p, half %v) { +; CHECK-LABEL: store_as4_half: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_short v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store half %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xi8(ptr addrspace(4) %p, <2 x i8> %v) { +; CHECK-LABEL: store_as4_2xi8: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_short v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store <2 x i8> %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xi16(ptr addrspace(4) %p, <2 x i16> %v) { +; CHECK-LABEL: store_as4_2xi16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, 
v1, s[0:1] +; CHECK-NEXT: s_endpgm + store <2 x i16> %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) { +; CHECK-LABEL: store_as4_2xi32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; CHECK-NEXT: s_endpgm + store <2 x i32> %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xhalf(ptr addrspace(4) %p, <2 x half> %v) { +; CHECK-LABEL: store_as4_2xhalf: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + store <2 x half> %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v) { +; CHECK-LABEL: store_as4_2xfloat: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; CHECK-NEXT: s_endpgm + store <2 x float> %v, ptr addrspace(4) %p + ret void +} + +define amdgpu_kernel void @store_as4_2xdouble(ptr addrspace(4) %p, <2 x double> %v) { +; CHECK-LABEL: store_as4_2xdouble: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; CHECK-NEXT: 
s_endpgm + store <2 x double> %v, ptr addrspace(4) %p + ret void +}