diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8e35ba77d69aa..ae82ef577a18a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -260,6 +260,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitFMinLike(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
+  bool visitBufferIntrinsic(IntrinsicInst &I);
 
   bool run();
 };
@@ -1910,6 +1911,15 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitFMinLike(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::amdgcn_raw_buffer_load:
+  case Intrinsic::amdgcn_raw_buffer_load_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load:
+  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store:
+  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    return visitBufferIntrinsic(I);
   default:
     return false;
   }
@@ -2046,6 +2056,75 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   return true;
 }
 
+/// Sink uniform addends in buffer address calculations into soffset.
+///
+/// Transforms buffer loads/stores with voffset = add(uniform, divergent)
+/// into voffset = divergent, soffset = uniform for better address coalescing.
+/// Only applies to raw buffer operations whose soffset is initially zero.
+bool AMDGPUCodeGenPrepareImpl::visitBufferIntrinsic(IntrinsicInst &I) {
+  Intrinsic::ID IID = I.getIntrinsicID();
+  bool IsLoad = (IID == Intrinsic::amdgcn_raw_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_buffer_load_format ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load ||
+                 IID == Intrinsic::amdgcn_raw_ptr_buffer_load_format);
+  bool IsStore = (IID == Intrinsic::amdgcn_raw_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_buffer_store_format ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store ||
+                  IID == Intrinsic::amdgcn_raw_ptr_buffer_store_format);
+
+  if (!IsLoad && !IsStore)
+    return false;
+
+  // Buffer intrinsic operand layout (same for vector and pointer descriptor):
+  // Load:  (rsrc, voffset, soffset, cachepolicy)
+  // Store: (vdata, rsrc, voffset, soffset, cachepolicy)
+  const unsigned VOffsetIdx = IsStore ? 2 : 1;
+  const unsigned SOffsetIdx = IsStore ? 3 : 2;
+
+  Value *VOffset = I.getArgOperand(VOffsetIdx);
+  Value *SOffset = I.getArgOperand(SOffsetIdx);
+
+  // Only optimize when soffset is currently zero.
+  if (!match(SOffset, m_Zero()))
+    return false;
+
+  // Pattern match: voffset = add(uniform, divergent).
+  Value *LHS, *RHS;
+  if (!match(VOffset, m_Add(m_Value(LHS), m_Value(RHS))))
+    return false;
+
+  bool LHSUniform = UA.isUniform(LHS);
+  bool RHSUniform = UA.isUniform(RHS);
+
+  // Need exactly one uniform and one divergent operand.
+  // TODO: Handle the case where both are uniform.
+  if (LHSUniform == RHSUniform)
+    return false;
+
+  Value *UniformAddend = LHSUniform ? LHS : RHS;
+  Value *DivergentAddend = LHSUniform ? RHS : LHS;
+
+  // Skip if the uniform addend is a non-negative constant that fits in the
+  // 12-bit immediate offset field. The backend will fold it into the immediate
+  // field, which avoids consuming an soffset operand.
+  // Negative or large constants must use soffset.
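+  // For example (using the constants exercised by the tests added below,
+  // purely as an illustration): a uniform addend of 100 is left to the
+  // immediate offset field, while -16 and 5000 are sunk into soffset.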
+  if (auto *CI = dyn_cast<ConstantInt>(UniformAddend)) {
+    int64_t Offset = CI->getSExtValue();
+    if (Offset >= 0 && Offset <= 4095)
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "AMDGPUCodeGenPrepare: Sinking uniform addend into "
+                       "soffset for buffer "
+                    << (IsStore ? "store" : "load") << ": " << I << '\n');
+
+  // Update the voffset and soffset operands.
+  I.setArgOperand(VOffsetIdx, DivergentAddend);
+  I.setArgOperand(SOffsetIdx, UniformAddend);
+
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll
new file mode 100644
index 0000000000000..c520062c5a01e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-loads.ll
@@ -0,0 +1,457 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=CHECK %s
+
+; Test comprehensive patterns for ADD(divergent, uniform) optimization in buffer loads
+
+; Basic workitem.id.x + uniform
+define amdgpu_kernel void @test_basic_workitem_uniform(ptr addrspace(1) %output, i32 %soffset) {
+; CHECK-LABEL: test_basic_workitem_uniform:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], s6 offen
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_store_dword v1, v0, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %desc = call <4 x i32> asm "", "=s"()
+  %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+  %sum = add i32 %voffset, %soffset
+  %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0)
+  store i32 %val, ptr addrspace(1) %output
+  ret void
+}
+
+; Reversed operands (uniform + divergent)
+define amdgpu_kernel void @test_reversed_operands(ptr addrspace(1) %output, i32 %soffset) {
+; CHECK-LABEL: test_reversed_operands:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], s6 offen
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_store_dword v1, v0, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %desc = call <4 x i32> asm "", "=s"()
+  %voffset = call i32 @llvm.amdgcn.workitem.id.x()
+  %sum = add i32 %soffset, %voffset ; Reversed: uniform + divergent
+  %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0)
+  store i32 %val, ptr addrspace(1) %output
+  ret void
+}
+
+; Multiple buffer loads with same pattern
+define amdgpu_kernel void @test_multiple_loads(ptr addrspace(1) %output, i32 %soffset1, i32 %soffset2) {
+; CHECK-LABEL: test_multiple_loads:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v1, v0, s[4:7], s2 offen
+; CHECK-NEXT:    buffer_load_dword v2, v0, s[4:7], s3 offen
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_add_u32_e32 v1, v1, v2
+; CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT:    s_endpgm
+  %desc = call <4 
x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + + %sum1 = add i32 %voffset, %soffset1 + %val1 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum1, i32 0, i32 0) + + %sum2 = add i32 %voffset, %soffset2 + %val2 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum2, i32 0, i32 0) + + %result = add i32 %val1, %val2 + store i32 %result, ptr addrspace(1) %output + ret void +} + +; Different buffer load variants - byte load +define amdgpu_kernel void @test_buffer_load_byte(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_byte: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_ubyte v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + %ext = zext i8 %val to i32 + store i32 %ext, ptr addrspace(1) %output + ret void +} + +; Different buffer load variants - short load +define amdgpu_kernel void @test_buffer_load_short(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_short: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_ushort v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + %ext = zext i16 %val to i32 + store i32 %ext, ptr addrspace(1) %output + ret void +} + +; Vector loads - v2i32 +define amdgpu_kernel void @test_buffer_load_v2i32(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store <2 x i32> %val, ptr addrspace(1) %output + ret void +} + +; Vector loads - v4i32 +define amdgpu_kernel void @test_buffer_load_v4i32(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store <4 x i32> %val, ptr addrspace(1) %output + ret void +} + +; Float loads +define amdgpu_kernel void @test_buffer_load_float(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_buffer_load_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store float %val, ptr addrspace(1) %output + ret void +} + +; Complex divergent expression + uniform +define amdgpu_kernel void @test_complex_divergent(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_complex_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %divergent = add i32 %tid_x, %tid_y ; Still divergent + %sum = add i32 %divergent, %soffset ; divergent + uniform + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - both operands divergent +define amdgpu_kernel void @test_both_divergent(ptr addrspace(1) %output) { +; CHECK-LABEL: test_both_divergent: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %sum = add i32 %tid_x, %tid_y + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - both operands uniform +define amdgpu_kernel void @test_both_uniform(ptr addrspace(1) %output, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_both_uniform: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; 
CHECK-NEXT: s_add_i32 s2, s2, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %sum = add i32 %soffset1, %soffset2 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Nested in control flow +define amdgpu_kernel void @test_control_flow(ptr addrspace(1) %output, i32 %soffset, i32 %condition) { +; CHECK-LABEL: test_control_flow: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s3, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB11_4 +; CHECK-NEXT: ; %bb.1: ; %else +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: global_store_dword v1, v1, s[0:1] +; CHECK-NEXT: s_cbranch_execnz .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %then +; CHECK-NEXT: buffer_load_dword v0, v0, s[4:7], s2 offen +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: .LBB11_3: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: s_branch .LBB11_2 + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %condition, 0 + br i1 %cmp, label %then, label %else + +then: + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + br label %end + +else: + store i32 0, ptr addrspace(1) %output + br label %end + +end: + ret void +} + +; Multiple uses of the ADD result - should still optimize buffer load +define amdgpu_kernel void @test_multiple_uses(ptr addrspace(1) %output1, ptr addrspace(1) %output2, i32 %soffset) { +; CHECK-LABEL: test_multiple_uses: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x34 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v2, v1, s[0:1] +; CHECK-NEXT: global_store_dword v2, v0, s[2:3] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output1 + store i32 %sum, ptr addrspace(1) %output2 + ret void +} + +; Chain of operations - workitem.id -> mul -> add -> buffer_load +define amdgpu_kernel void @test_operation_chain(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_operation_chain: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %scaled = mul i32 %tid, 4 ; Still divergent + %sum = add i32 %scaled, %soffset ; divergent + uniform + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - Buffer load with non-zero soffset field already +define amdgpu_kernel void @test_existing_soffset(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_existing_soffset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: s_movk_i32 s6, 0x64 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 100, i32 0) ; Non-zero soffset + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - Structured buffer loads +define amdgpu_kernel void @test_struct_buffer_load(ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_struct_buffer_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x2c +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_add_u32_e32 v0, s6, v0 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should NOT optimize - small positive constant fits in immediate offset field +define amdgpu_kernel void @test_small_positive_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_small_positive_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 100 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should optimize - negative constant must use soffset +define amdgpu_kernel void @test_negative_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_negative_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], -16 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 
x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, -16 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} + +; Should optimize - large constant doesn't fit in immediate offset field +define amdgpu_kernel void @test_large_constant(ptr addrspace(1) %output) { +; CHECK-LABEL: test_large_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_movk_i32 s6, 0x1388 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], s6 offen +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_store_dword v1, v0, s[0:1] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 5000 + %val = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %val, ptr addrspace(1) %output + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll new file mode 100644 index 0000000000000..a180d29f20e10 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/buffer-offset-to-soffset-stores.ll @@ -0,0 +1,461 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=CHECK %s + +; Test comprehensive patterns for ADD(divergent, uniform) optimization in buffer stores + +; Basic workitem.id.x + uniform for store +define amdgpu_kernel void @test_basic_workitem_uniform_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_basic_workitem_uniform_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Reversed operands (uniform + divergent) for store +define amdgpu_kernel void @test_reversed_operands_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_reversed_operands_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %soffset, %voffset ; Reversed: uniform + divergent + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Multiple buffer stores with same pattern +define amdgpu_kernel void 
@test_multiple_stores(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_multiple_stores: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen +; CHECK-NEXT: v_add_u32_e32 v1, 10, v1 +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s3 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, ptr addrspace(1) %input + + %sum1 = add i32 %voffset, %soffset1 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum1, i32 0, i32 0) + + %sum2 = add i32 %voffset, %soffset2 + %val2 = add i32 %val, 10 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val2, <4 x i32> %desc, i32 %sum2, i32 0, i32 0) + + ret void +} + +; Different buffer store variants - byte store +define amdgpu_kernel void @test_buffer_store_byte(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_byte: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + %trunc = trunc i32 %val to i8 + call void @llvm.amdgcn.raw.buffer.store.i8(i8 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Different buffer store variants - short store +define amdgpu_kernel void @test_buffer_store_short(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_short: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + %trunc = trunc i32 %val to i16 + call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Vector stores - v2i32 +define amdgpu_kernel void @test_buffer_store_v2i32(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_v2i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load <2 x 
i32>, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Vector stores - v4i32 +define amdgpu_kernel void @test_buffer_store_v4i32(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load <4 x i32>, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Float stores +define amdgpu_kernel void @test_buffer_store_float(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_buffer_store_float: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load float, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Complex divergent expression + uniform for store +define amdgpu_kernel void @test_complex_divergent_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_complex_divergent_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v2, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 @llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %divergent = add i32 %tid_x, %tid_y ; Still divergent + %sum = add i32 %divergent, %soffset ; divergent + uniform + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT optimize - both operands divergent +define amdgpu_kernel void @test_both_divergent_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_both_divergent_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_add_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v2, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid_x = call i32 
@llvm.amdgcn.workitem.id.x() + %tid_y = call i32 @llvm.amdgcn.workitem.id.y() + %sum = add i32 %tid_x, %tid_y + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT optimize - both operands uniform +define amdgpu_kernel void @test_both_uniform_store(ptr addrspace(1) %input, i32 %soffset1, i32 %soffset2) { +; CHECK-LABEL: test_both_uniform_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v0, s[0:1] +; CHECK-NEXT: s_add_i32 s0, s2, s3 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %sum = add i32 %soffset1, %soffset2 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Nested in control flow +define amdgpu_kernel void @test_control_flow_store(ptr addrspace(1) %input, i32 %soffset, i32 %condition) { +; CHECK-LABEL: test_control_flow_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: s_cmp_lg_u32 s3, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB11_4 +; CHECK-NEXT: ; %bb.1: ; %else +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen +; CHECK-NEXT: s_cbranch_execnz .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %then +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[4:7], s2 offen +; CHECK-NEXT: .LBB11_3: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: s_branch .LBB11_2 + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, ptr addrspace(1) %input + %cmp = icmp eq i32 %condition, 0 + br i1 %cmp, label %then, label %else + +then: + %sum = add i32 %voffset, %soffset + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + br label %end + +else: + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %voffset, i32 0, i32 0) + br label %end + +end: + ret void +} + +; Multiple uses of the ADD result - should still optimize buffer store +define amdgpu_kernel void @test_multiple_uses_store(ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %soffset) { +; CHECK-LABEL: test_multiple_uses_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; CHECK-NEXT: s_load_dword s8, s[4:5], 0x34 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v2, v1, s[0:1] +; CHECK-NEXT: v_add_u32_e32 v3, s8, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v0, s[4:7], s8 offen +; CHECK-NEXT: global_store_dword v1, v3, s[2:3] +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + store i32 %sum, ptr 
addrspace(1) %output + ret void +} + +; Chain of operations - workitem.id -> mul -> add -> buffer_store +define amdgpu_kernel void @test_operation_chain_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_operation_chain_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: v_mul_u32_u24_e32 v0, 4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %scaled = mul i32 %tid, 4 ; Still divergent + %sum = add i32 %scaled, %soffset ; divergent + uniform + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should NOT optimize - Buffer store with non-zero soffset field already +define amdgpu_kernel void @test_existing_soffset_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_existing_soffset_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 100, i32 0) ; Non-zero soffset + ret void +} + +; Should NOT optimize - Structured buffer stores +define amdgpu_kernel void @test_struct_buffer_store(ptr addrspace(1) %input, i32 %soffset) { +; CHECK-LABEL: test_struct_buffer_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x2c +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_add_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, %soffset + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0, i32 0) + ret void +} + +; Should NOT optimize - small positive constant fits in immediate offset field +define amdgpu_kernel void @test_small_positive_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_small_positive_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; CHECK-NEXT: s_endpgm + %desc = 
call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 100 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should optimize - negative constant must use soffset +define amdgpu_kernel void @test_negative_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_negative_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], -16 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, -16 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} + +; Should optimize - large constant doesn't fit in immediate offset field +define amdgpu_kernel void @test_large_constant_store(ptr addrspace(1) %input) { +; CHECK-LABEL: test_large_constant_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_movk_i32 s4, 0x1388 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dword v1, v1, s[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], s4 offen +; CHECK-NEXT: s_endpgm + %desc = call <4 x i32> asm "", "=s"() + %voffset = call i32 @llvm.amdgcn.workitem.id.x() + %sum = add i32 %voffset, 5000 + %val = load i32, ptr addrspace(1) %input + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %val, <4 x i32> %desc, i32 %sum, i32 0, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index e6a59f43ad690..0177c50ca80c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -435,24 +435,28 @@ main_body: } define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +; PREGFX10-LABEL: buffer_load_negative_offset: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: ; return to shader part epilog +; ; GFX10-LABEL: buffer_load_negative_offset: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_negative_offset: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], -16 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: buffer_load_negative_offset: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], null offen +; GFX12-NEXT: s_mov_b32 s{{[0-9]+}}, -16 +; GFX12-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], s{{[0-9]+}} 
offen ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index a9799993f5cdc..f07091e77d36f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -387,17 +387,21 @@ main_body: } define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { +; PREGFX10-LABEL: buffer_load_negative_offset: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: ; return to shader part epilog +; ; GFX10-LABEL: buffer_load_negative_offset: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], -16 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_negative_offset: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_add_nc_u32_e32 v0, -16, v0 -; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], -16 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog main_body: