diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 203790ed95153..9fce41259af6b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -536,52 +536,49 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
   LogicalResult
   matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;
-
+    Location loc = op.getLoc();
+    // This ensures that waits on global memory aren't introduced on
+    // chips that don't have the BackOffBarrier feature enabled in LLVM.
+    bool requiresInlineAsm = chipset < kGfx90a;
+
+    Attribute mmra =
+        rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+    // Note: while there *is* a workgroup-one-as scope, this, when combined with
+    // the MMRA, will lead to the fence having no effect. This is because the
+    // codepaths for an atomic load or store will observe that a
+    // one-address-space atomic to LDS requires no synchronization because
+    // operations on LDS are totally ordered with respect to each other, and so
+    // will not emit the correct waitcnt operations that these fences are
+    // intended to produce. Therefore, we use a broader type of fence and rely
+    // on the MMRA to relax it to the semantics we want.
+    StringRef scope = "workgroup";
+
+    auto relFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::release, scope);
+    relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
     if (requiresInlineAsm) {
       auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                       LLVM::AsmDialect::AD_ATT);
-      const char *asmStr =
-          ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+      const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
       const char *constraints = "";
-      rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
-          op,
+      LLVM::InlineAsmOp::create(
+          rewriter, loc,
           /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
           /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
           /*is_align_stack=*/false, LLVM::TailCallKind::None,
           /*asm_dialect=*/asmDialectAttr,
           /*operand_attrs=*/ArrayAttr());
-      return success();
-    }
-    if (chipset.majorVersion < 12) {
-      constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
-      constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
-      // Left in place in case someone disables the inline ASM path or future
-      // chipsets use the same bit pattern.
-      constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
-
-      int32_t ldsOnlyBits;
-      if (chipset.majorVersion == 11)
-        ldsOnlyBits = ldsOnlyBitsGfx11;
-      else if (chipset.majorVersion == 10)
-        ldsOnlyBits = ldsOnlyBitsGfx10;
-      else if (chipset.majorVersion <= 9)
-        ldsOnlyBits = ldsOnlyBitsGfx6789;
-      else
-        return op.emitOpError(
-                   "don't know how to lower this for chipset major version")
-               << chipset.majorVersion;
-
-      Location loc = op->getLoc();
-      ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits);
-      rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
+    } else if (chipset.majorVersion < 12) {
+      ROCDL::SBarrierOp::create(rewriter, loc);
     } else {
-      Location loc = op->getLoc();
-      ROCDL::WaitDscntOp::create(rewriter, loc, 0);
       ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
-      rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
+      ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
     }
 
+    auto acqFence = LLVM::FenceOp::create(rewriter, loc,
+                                          LLVM::AtomicOrdering::acquire, scope);
+    acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
+    rewriter.replaceOp(op, acqFence);
     return success();
   }
 };
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index cc1162d8b0de8..5dd1046cce041 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -8,6 +8,8 @@
 // Note: #gpu.address_space<global> is hardcoded to `1` here because the
 // test pass doesn't set up the GPU address space conversions.
 
+// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
+
 #gpu_global_addrspace = 1
 
 // CHECK-LABEL: func @fat_raw_buffer_cast
@@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
 // CHECK-LABEL: func @lds_barrier
 func.func @lds_barrier() {
+  // CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
   // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX90A: rocdl.s.waitcnt -7937
+  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
   // GFX90A-NEXT: rocdl.s.barrier
-  // GFX942: rocdl.s.waitcnt -7937
   // GFX942-NEXT: rocdl.s.barrier
-  // GFX10: rocdl.s.waitcnt -16129
   // GFX10-NEXT: rocdl.s.barrier
-  // GFX11: llvm.inline_asm has_side_effects asm_dialect = att
-  // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
-  // GFX12: rocdl.s.wait.dscnt 0
+  // GFX11-NEXT: rocdl.s.barrier
   // GFX12-NEXT: rocdl.s.barrier.signal -1
   // GFX12-NEXT: rocdl.s.barrier.wait -1
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
   amdgpu.lds_barrier
   func.return
 }
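For reference, a sketch of what `amdgpu.lds_barrier` now lowers to on a gfx90a/gfx942-class target, reconstructed from the FileCheck lines above (the exact printed form may differ):

  llvm.fence syncscope("workgroup") release {llvm.mmra = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">}
  rocdl.s.barrier
  llvm.fence syncscope("workgroup") acquire {llvm.mmra = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">}

When translated to LLVM IR, the MMRA tag lets the AMDGPU backend relax these workgroup fences so they only produce waits on LDS (local address space) operations rather than also waiting on global memory, which is what the old hand-computed `s_waitcnt` masks achieved.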