From 244905ccecc22ed725ae3b699a31f25f21f5a7fc Mon Sep 17 00:00:00 2001
From: Prasoon Mishra
Date: Fri, 26 Sep 2025 11:14:07 +0000
Subject: [PATCH] [AMDGPU] Sink uniform buffer address offsets into soffset

This patch implements an optimization to partition MUBUF load/store
offsets into vector and scalar components for better address coalescing
and reduced VGPR pressure.

Transform buffer operations where voffset = add(uniform, divergent) by
moving the uniform part to soffset and keeping the divergent part in
voffset.

Before:
  v_add_u32 v1, v0, sN
  buffer_{load,store}_T v*, v1, s[bufDesc:bufDesc+3] offen

After:
  buffer_{load,store}_T v*, v0, s[bufDesc:bufDesc+3], sN offen

The optimization currently applies to raw buffer loads/stores when
soffset is initially zero.

Tests include comprehensive coverage of buffer loads and stores across
the supported variants (i8, i16, i32, vectors, floats), with positive
and negative test cases.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  79 +++
 .../AMDGPU/buffer-offset-to-soffset-loads.ll  | 457 +++++++++++++++++
 .../AMDGPU/buffer-offset-to-soffset-stores.ll | 461 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.raw.buffer.load.ll     |  16 +-
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll |  12 +-
 5 files changed, 1015 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..ae82ef577a18a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -260,6 +260,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
+  bool visitBufferIntrinsic(IntrinsicInst &I);
   bool run();
 };
@@ -1910,6 +1911,15 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    return visitBufferIntrinsic(I);
   default:
     return false;
   }
@@ -2046,6 +2056,75 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   return true;
 }

+/// Sink uniform addends in buffer address calculations into soffset.
+///
+/// Transforms buffer loads/stores with voffset = add(uniform, divergent)
+/// into voffset = divergent, soffset = uniform, for better address coalescing.
+/// Only applies to raw buffer operations with soffset initially zero.
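+///
+/// For example (illustrative IR, not taken from the tests; %uni stands for
+/// any uniform i32 and %div for any divergent i32):
+///   %off = add i32 %uni, %div
+///   call void @llvm.amdgcn.raw.buffer.store.i32(i32 %v, <4 x i32> %rsrc, i32 %off, i32 0, i32 0)
+/// becomes
+///   call void @llvm.amdgcn.raw.buffer.store.i32(i32 %v, <4 x i32> %rsrc, i32 %div, i32 %uni, i32 0)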
+bool AMDGPUCodeGenPrepareImpl::visitBufferIntrinsic(IntrinsicInst &I) {
+  Intrinsic::ID IID = I.getIntrinsicID();
+  bool IsLoad = (IID == Intrinsic::amdgcn_raw_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_buffer_load_format ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load_format);
+  bool IsStore = (IID == Intrinsic::amdgcn_raw_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_buffer_store_format ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store_format);
+
+  if (!IsLoad && !IsStore)
+    return false;
+
+  // Buffer intrinsic operand layout (same for vector and pointer descriptor):
+  //   Load:  (rsrc, voffset, soffset, cachepolicy)
+  //   Store: (vdata, rsrc, voffset, soffset, cachepolicy)
+  const unsigned VOffsetIdx = IsStore ? 2 : 1;
+  const unsigned SOffsetIdx = IsStore ? 3 : 2;
+
+  Value *VOffset = I.getArgOperand(VOffsetIdx);
+  Value *SOffset = I.getArgOperand(SOffsetIdx);
+
+  // Only optimize when soffset is currently zero.
+  if (!match(SOffset, m_Zero()))
+    return false;
+
+  // Pattern match: voffset = add(uniform, divergent).
+  Value *LHS, *RHS;
+  if (!match(VOffset, m_Add(m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  bool LHSUniform = UA.isUniform(LHS);
+  bool RHSUniform = UA.isUniform(RHS);
+
+  // Need exactly one uniform and one divergent operand.
+  // TODO: Handle the case where both are uniform.
+  if (LHSUniform == RHSUniform)
+    return false;
+
+  Value *UniformAddend = LHSUniform ? LHS : RHS;
+  Value *DivergentAddend = LHSUniform ? RHS : LHS;
+
+  // Skip if the uniform addend is a non-negative constant that fits in the
+  // 12-bit immediate offset field. The backend will fold it into the immediate
+  // field, which avoids consuming an soffset operand.
+  // Negative or large constants must use soffset.
+  if (auto *CI = dyn_cast<ConstantInt>(UniformAddend)) {
+    int64_t Offset = CI->getSExtValue();
+    if (Offset >= 0 && Offset <= 4095)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "AMDGPUCodeGenPrepare: Sinking uniform addend into "
+                       "soffset for buffer "
+                    << (IsStore ?
"store" : "load") << ": " << I << '\n'); + + // Update voffset and soffset operands + I.setArgOperand(VOffsetIdx, DivergentAddend); + I.setArgOperand(SOffsetIdx, UniformAddend); + + return true; +} + bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) return false; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll new file mode 100644 index 0000000000000..c520062c5a01e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll @@ -0,0 +1,457 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=CHECK %s + +; Test comprehensive patterns for ADD(divergent, uniform) optimization in buffer loads + +; Basic workitem.id.x + uniform +define amdgpu_kernel void @test_basic_workitem_uniform(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_basic_workitem_uniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Reversed operands (uniform + divergent) +define amdgpu_kernel void @test_reversed_operands(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_reversed_operands: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %soffset, %voffset ; Reversed: uniform + divergent + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Multiple buffer loads with same pattern +define amdgpu_kernel void @test_multiple_loads(ptr addrspace(1) %output, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_multiple_loads: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v1, v0, s[4:7], s2 offen +; CHECK-NEXT: buffer_load_dword v2, v0, s[4:7], s3 offen +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v1, v1, v2 +; CHECK-NEXT: global_store_dword v0, v1, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + + %sum1 = add i32 %voffset, %soffset1 + %val1 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum1, i32 0, i32 0) + + %sum2 = add i32 %voffset, %soffset2 + %val2 = 
call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum2, i32 0, i32 0) + + %result = add i32 %val1, %val2 + store i32 %result, ptr addrspace(1) %output + ret void +} + +; Different buffer load variants - byte load +define amdgpu_kernel void @test_buffer_load_byte(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_byte: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_ubyte v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + %ext = zext i8 %val to i32 + store i32 %ext, ptr addrspace(1) %output + ret void +} + +; Different buffer load variants - short load +define amdgpu_kernel void @test_buffer_load_short(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_short: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_ushort v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + %ext = zext i16 %val to i32 + store i32 %ext, ptr addrspace(1) %output + ret void +} + +; Vector loads - v2i32 +define amdgpu_kernel void @test_buffer_load_v2i32(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store <2 x i32> %val, ptr addrspace(1) %output + ret void +} + +; Vector loads - v4i32 +define amdgpu_kernel void @test_buffer_load_v4i32(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 
%voffset, %soffset + %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store <4 x i32> %val, ptr addrspace(1) %output + ret void +} + +; Float loads +define amdgpu_kernel void @test_buffer_load_float(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store float %val, ptr addrspace(1) %output + ret void +} + +; Complex divergent expression + uniform +define amdgpu_kernel void @test_complex_divergent(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_complex_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %divergent = add i32 %tid_x, %tid_y ; Still divergent + %sum = add i32 %divergent, %soffset ; divergent + uniform + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - both operands divergent +define amdgpu_kernel void @test_both_divergent(ptr addrspace(1) %output) { +; CHECK-LABEL: test_both_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %sum = add i32 %tid_x, %tid_y + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - both operands uniform +define amdgpu_kernel void @test_both_uniform(ptr addrspace(1) %output, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_both_uniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_add_i32 s2, s2, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x 
i32> asm "", "=s"() + %sum = add i32 %soffset1, %soffset2 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Nested in control flow +define amdgpu_kernel void @test_control_flow(ptr addrspace(1) %output, i32 %soffset, i32 %condition) { +; CHECK-LABEL: test_control_flow: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s3, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB11_4 +; CHECK-NEXT: ; %bb.1: ; %else +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: global_store_dword v1, v1, s[0:1] +; CHECK-NEXT: s_cbranch_execnz .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %then +; CHECK-NEXT: buffer_load_dword v0, v0, s[4:7], s2 offen +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: .LBB11_3: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: s_branch .LBB11_2 + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %condition, 0 + br i1 %cmp, label %then, label %else + +then: + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + br label %end + +else: + store i32 0, ptr addrspace(1) %output + br label %end + +end: + ret void +} + +; Multiple uses of the ADD result - should still optimize buffer load +define amdgpu_kernel void @test_multiple_uses(ptr addrspace(1) %output1, ptr addrspace(1) %output2, i32 %soffset) { +; CHECK-LABEL: test_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v2, v1, s[0:1] +; CHECK-NEXT: global_store_dword v2, v0, s[2:3] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output1 + store i32 %sum, ptr addrspace(1) %output2 + ret void +} + +; Chain of operations - workitem.id -> mul -> add -> buffer_load +define amdgpu_kernel void @test_operation_chain(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_operation_chain: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %scaled = mul i32 %tid, 4 ; Still divergent + %sum = add i32 %scaled, %soffset ; divergent + uniform + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) 
%output + ret void +} + +; Should NOT optimize - Buffer load with non-zero soffset field already +define amdgpu_kernel void @test_existing_soffset(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_existing_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: s_movk_i32 s6, 0x64 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 100, i32 0) ; Non-zero soffset + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - Structured buffer loads +define amdgpu_kernel void @test_struct_buffer_load(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_struct_buffer_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - small positive constant fits in immediate offset field +define amdgpu_kernel void @test_small_positive_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_small_positive_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 100 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should optimize - negative constant must use soffset +define amdgpu_kernel void @test_negative_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_negative_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], -16 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, -16 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; 
Should optimize - large constant doesn't fit in immediate offset field +define amdgpu_kernel void @test_large_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_large_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_movk_i32 s6, 0x1388 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 5000 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll new file mode 100644 index 0000000000000..a180d29f20e10 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll @@ -0,0 +1,461 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=CHECK %s + +; Test comprehensive patterns for ADD(divergent, uniform) optimization in buffer stores + +; Basic workitem.id.x + uniform for store +define amdgpu_kernel void @test_basic_workitem_uniform_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_basic_workitem_uniform_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Reversed operands (uniform + divergent) for store +define amdgpu_kernel void @test_reversed_operands_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_reversed_operands_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %soffset, %voffset ; Reversed: uniform + divergent + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Multiple buffer stores with same pattern +define amdgpu_kernel void @test_multiple_stores(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_multiple_stores: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen +; CHECK-NEXT: v_add_u32_e32 v1, 10, v1 +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s3 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, ptr addrspace(1) %input + + %sum1 = add i32 %voffset, %soffset1 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum1, i32 0, i32 0) + + %sum2 = add i32 %voffset, %soffset2 + %val2 = add i32 %val, 10 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val2, <4 x i32> %desc, i32 %sum2, i32 0, i32 0) + + ret void +} + +; Different buffer store variants - byte store +define amdgpu_kernel void @test_buffer_store_byte(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_byte: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + %trunc = trunc i32 %val to i8 + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Different buffer store variants - short store +define amdgpu_kernel void @test_buffer_store_short(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_short: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + %trunc = trunc i32 %val to i16 + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Vector stores - v2i32 +define amdgpu_kernel void @test_buffer_store_v2i32(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load <2 x i32>, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Vector stores - v4i32 +define amdgpu_kernel void @test_buffer_store_v4i32(ptr addrspace(1) %input, i32 %soffset) { +; 
CHECK-LABEL: test_buffer_store_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load <4 x i32>, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Float stores +define amdgpu_kernel void @test_buffer_store_float(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load float, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Complex divergent expression + uniform for store +define amdgpu_kernel void @test_complex_divergent_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_complex_divergent_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v2, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %divergent = add i32 %tid_x, %tid_y ; Still divergent + %sum = add i32 %divergent, %soffset ; divergent + uniform + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT optimize - both operands divergent +define amdgpu_kernel void @test_both_divergent_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_both_divergent_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v2, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %sum = add i32 %tid_x, %tid_y + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT 
optimize - both operands uniform +define amdgpu_kernel void @test_both_uniform_store(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_both_uniform_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v0, s[0:1] +; CHECK-NEXT: s_add_i32 s0, s2, s3 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %sum = add i32 %soffset1, %soffset2 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Nested in control flow +define amdgpu_kernel void @test_control_flow_store(ptr addrspace(1) %input, i32 %soffset, i32 %condition) { +; CHECK-LABEL: test_control_flow_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: s_cmp_lg_u32 s3, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB11_4 +; CHECK-NEXT: ; %bb.1: ; %else +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; CHECK-NEXT: s_cbranch_execnz .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %then +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen +; CHECK-NEXT: .LBB11_3: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: s_branch .LBB11_2 + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, ptr addrspace(1) %input + %cmp = icmp eq i32 %condition, 0 + br i1 %cmp, label %then, label %else + +then: + %sum = add i32 %voffset, %soffset + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + br label %end + +else: + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %voffset, i32 0, i32 0) + br label %end + +end: + ret void +} + +; Multiple uses of the ADD result - should still optimize buffer store +define amdgpu_kernel void @test_multiple_uses_store(ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_multiple_uses_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_dword s8, s[4:5], 0x34 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v1, s[0:1] +; CHECK-NEXT: v_add_u32_e32 v3, s8, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[4:7], s8 offen +; CHECK-NEXT: global_store_dword v1, v3, s[2:3] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %sum, ptr addrspace(1) %output + ret void +} + +; Chain of operations - workitem.id -> mul -> add -> buffer_store +define amdgpu_kernel void @test_operation_chain_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_operation_chain_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %scaled = mul i32 %tid, 4 ; Still divergent + %sum = add i32 %scaled, %soffset ; divergent + uniform + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT optimize - Buffer store with non-zero soffset field already +define amdgpu_kernel void @test_existing_soffset_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_existing_soffset_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 100, i32 0) ; Non-zero soffset + ret void +} + +; Should NOT optimize - Structured buffer stores +define amdgpu_kernel void @test_struct_buffer_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_struct_buffer_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0, i32 0) + ret void +} + +; Should NOT optimize - small positive constant fits in immediate offset field +define amdgpu_kernel void @test_small_positive_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_small_positive_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 100 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should 
optimize - negative constant must use soffset +define amdgpu_kernel void @test_negative_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_negative_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], -16 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, -16 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should optimize - large constant doesn't fit in immediate offset field +define amdgpu_kernel void @test_large_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_large_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_movk_i32 s4, 0x1388 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 5000 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index e6a59f43ad690..0177c50ca80c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -435,24 +435,28 @@ main_body: } define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +; PREGFX10-LABEL: buffer_load_negative_offset: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: ; return to shader part epilog +; ; GFX10-LABEL: buffer_load_negative_offset: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_negative_offset: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], -16 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: buffer_load_negative_offset: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s{{[0-9]+}}, -16 +; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], s{{[0-9]+}} offen ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index a9799993f5cdc..f07091e77d36f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -387,17 +387,21 @@ main_body: } define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { +; PREGFX10-LABEL: buffer_load_negative_offset: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: ; return to shader part epilog +; ; GFX10-LABEL: buffer_load_negative_offset: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_negative_offset: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], -16 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: