From b414fff24927da4635cea21a69d8714ae11ce9a2 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Wed, 10 Sep 2025 18:15:42 +0000
Subject: [PATCH 1/3] [mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier

The semantics of amdgpu.lds_barrier are "an s_barrier, where all LDS
operations before the barrier happen-before LDS operations after it,
and where no inherent fence or forcing-to-completion of global memory
is implied (for performance)". This was previously implemented through
manual calls to waitcnt intrinsics and the s_barrier intrinsic(s). The
lack of explicit fencing led to miscompiles (where LDS accesses were
reordered across the barrier) on gfx12.

Since LLVM now allows MMRA annotations to ensure that a pair of fences
only fences LDS accesses, use such fences to explicitly represent the
semantics we want instead of trying to prescribe the method of their
implementation.

Note that the gfx908 workaround of hiding the s_barrier in inline
assembly in order to prevent spurious vmem waits remains in place, but
it is removed for gfx11 because the fences have recently been changed
to give us the effect we want there.
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 53 ++++++++-----------
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir        | 13 +++--
 2 files changed, 27 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 203790ed95153..4d2290934eab1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -536,52 +536,41 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
   LogicalResult
   matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;
+    Location loc = op.getLoc();
+    // This ensures that waits on global memory aren't introduced on
+    // chips that don't have the BackOffBarrier feature enabled in LLVM.
+    bool requiresInlineAsm = chipset < kGfx90a;
+
+    Attribute mmra =
+        rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+    StringRef scope = "workgroup-one-as";
+    auto relFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::release, scope);
+    relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
     if (requiresInlineAsm) {
       auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                       LLVM::AsmDialect::AD_ATT);
-      const char *asmStr =
-          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+      const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
       const char *constraints = "";
-      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
-          op,
+      LLVM::InlineAsmOp::create(
+          rewriter, loc,
           /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
           /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
           /*is_align_stack=*/false, LLVM::TailCallKind::None,
           /*asm_dialect=*/asmDialectAttr,
           /*operand_attrs=*/ArrayAttr());
-      return success();
-    }
-    if (chipset.majorVersion < 12) {
-      constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
-      constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
-      // Left in place in case someone disables the inline ASM path or future
-      // chipsets use the same bit pattern.
-      constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
-
-      int32_t ldsOnlyBits;
-      if (chipset.majorVersion == 11)
-        ldsOnlyBits = ldsOnlyBitsGfx11;
-      else if (chipset.majorVersion == 10)
-        ldsOnlyBits = ldsOnlyBitsGfx10;
-      else if (chipset.majorVersion <= 9)
-        ldsOnlyBits = ldsOnlyBitsGfx6789;
-      else
-        return op.emitOpError(
-                   "don't know how to lower this for chipset major version")
-               << chipset.majorVersion;
-
-      Location loc = op->getLoc();
-      ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits);
-      rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
+    } else if (chipset.majorVersion < 12) {
+      ROCDL::SBarrierOp::create(rewriter, loc);
     } else {
-      Location loc = op->getLoc();
-      ROCDL::WaitDscntOp::create(rewriter, loc, 0);
       ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
-      rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
+      ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
     }
+    auto acqFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::acquire, scope);
+    acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+    rewriter.replaceOp(op, acqFence);
     return success();
   }
 };
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..91daa75ccb58f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -8,6 +8,8 @@
 // Note: #gpu.address_space is hardcoded to `1` here because the
 // test pass doesn't set up the GPU address space conversions.
 
+// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
+
 #gpu_global_addrspace = 1
 
 // CHECK-LABEL: func @fat_raw_buffer_cast
@@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
+  // CHECK: llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX90A: rocdl.s.waitcnt -7937
+  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
-  // GFX942: rocdl.s.waitcnt -7937
   // GFX942-NEXT: rocdl.s.barrier
-  // GFX10: rocdl.s.waitcnt -16129
   // GFX10-NEXT: rocdl.s.barrier
-  // GFX11: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX12: rocdl.s.wait.dscnt 0
+  // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
+  // CHECK-NEXT: llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }

From 9706ccc25d35c88189093829eadb6d5afd786187 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Thu, 11 Sep 2025 19:55:34 +0000
Subject: [PATCH 2/3] Move to a workgroup fence from a workgroup-one-as fence,
 since a one-address-space fence combined with the MMRA has no effect

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp    | 10 +++++++++-
 .../test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir |  4 ++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4d2290934eab1..ea5ef0383b219 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -543,7 +543,15 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
 
     Attribute mmra =
         rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
-    StringRef scope = "workgroup-one-as";
+    // Note: while there *is* a workgroup-one-as scope, this, when combined with
+    // the MMRA, will lead to the fence having no effect. This is because
+    // the codepaths for an atomic load or store will observe that a
+    // one-address-space atomic to LDS requires no synchronization because
+    // operations on LDS are totally ordered with respect to each other,
+    // and so will not emit the correct waitcnt operations that these fences
+    // are intended to produce. Therefore, we use a broader type of fence
+    // and rely on the MMRA to relax it to the semantics we want.
+    StringRef scope = "workgroup";
     auto relFence = LLVM::FenceOp::create(rewriter, loc,
                                           LLVM::AtomicOrdering::release, scope);
     relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 91daa75ccb58f..5dd1046cce041 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -416,7 +416,7 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
-  // CHECK: llvm.fence syncscope("workgroup-one-as") release {llvm.mmra = #[[$MMRA_TAG]]}
+  // CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
   // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
@@ -425,7 +425,7 @@ func.func @lds_barrier() {
   // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
-  // CHECK-NEXT: llvm.fence syncscope("workgroup-one-as") acquire {llvm.mmra = #[[$MMRA_TAG]]}
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }

From 6639112187c88cc4a08d72d32736866a2634faeb Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Fri, 12 Sep 2025 18:44:26 +0000
Subject: [PATCH 3/3] Rewrap comment

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ea5ef0383b219..9fce41259af6b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -544,13 +544,13 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
     Attribute mmra =
         rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
     // Note: while there *is* a workgroup-one-as scope, this, when combined with
-    // the MMRA, will lead to the fence having no effect. This is because
-    // the codepaths for an atomic load or store will observe that a
+    // the MMRA, will lead to the fence having no effect. This is because the
+    // codepaths for an atomic load or store will observe that a
     // one-address-space atomic to LDS requires no synchronization because
-    // operations on LDS are totally ordered with respect to each other,
-    // and so will not emit the correct waitcnt operations that these fences
-    // are intended to produce. Therefore, we use a broader type of fence
-    // and rely on the MMRA to relax it to the semantics we want.
+    // operations on LDS are totally ordered with respect to each other, and so
+    // will not emit the correct waitcnt operations that these fences are
+    // intended to produce. Therefore, we use a broader type of fence and rely
+    // on the MMRA to relax it to the semantics we want.
     StringRef scope = "workgroup";
     auto relFence = LLVM::FenceOp::create(rewriter, loc,