Skip to content

Commit 5ecc6d1

Browse files
authored
[mlir][AMDGPU] Use LDS-only MMRA fences for lds_barrier (#157919)
The previous lowering strategy for amdgpu.lds_barrier (which is an operation whose semantics are) "s.barrier, and all LDS operations before this happen-before LDS operations after this, and there must not be an inherent fence/forcing-to-completion of global memory (for performance)" was previosuly implemented through using manual calls to waitcnt() intrinsics and the s_barrire intrinsic(s). The lack of explicit fencing enabled miscompiles (where LDS accesses were reordered with the barrier) on gfx12. Since LLVM now allows MMRA annotations to ensure that only LDS accesses are fenced by a pair of fences, we can now use these fences in order to explicitly represent the semantics we want instead of trying to prescribe the method of their implemntation. Note that the gfx908 workaround of hiding the s_barrier in inline assembly in order to prevent spurious vmem barriers remains in place, but is is removed for gfx11 because the fences have been changed to give us the effect we want recently.
1 parent 8b7a76a commit 5ecc6d1

File tree

2 files changed

+36
-40
lines changed

2 files changed

+36
-40
lines changed

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 30 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -536,52 +536,49 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
536536
LogicalResult
537537
matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
538538
ConversionPatternRewriter &rewriter) const override {
539-
bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11;
540-
539+
Location loc = op.getLoc();
540+
// This ensures that waits on global memory aren't introduced on
541+
// chips that don't have the BackOffBarrier feature enabled in LLVM.
542+
bool requiresInlineAsm = chipset < kGfx90a;
543+
544+
Attribute mmra =
545+
rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
546+
// Note: while there *is* a workgroup-one-as scope, this, when combined with
547+
// the MMRA, will lead to the fence having no effect. This is because the
548+
// codepaths for an atomic load or store will observe that a
549+
// one-address-space atomic to LDS requires no synchronization because
550+
// operations on LDS are totally ordered with respect to each other, and so
551+
// will not emit the correct waitcnt operations that these fences are
552+
// intended to produce. Therefore, we use a broader type of fence and rely
553+
// on the MMRA to relax it to the semantics we want.
554+
StringRef scope = "workgroup";
555+
556+
auto relFence = LLVM::FenceOp::create(rewriter, loc,
557+
LLVM::AtomicOrdering::release, scope);
558+
relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
541559
if (requiresInlineAsm) {
542560
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
543561
LLVM::AsmDialect::AD_ATT);
544-
const char *asmStr =
545-
";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
562+
const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
546563
const char *constraints = "";
547-
rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
548-
op,
564+
LLVM::InlineAsmOp::create(
565+
rewriter, loc,
549566
/*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
550567
/*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
551568
/*is_align_stack=*/false, LLVM::TailCallKind::None,
552569
/*asm_dialect=*/asmDialectAttr,
553570
/*operand_attrs=*/ArrayAttr());
554-
return success();
555-
}
556-
if (chipset.majorVersion < 12) {
557-
constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
558-
constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
559-
// Left in place in case someone disables the inline ASM path or future
560-
// chipsets use the same bit pattern.
561-
constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
562-
563-
int32_t ldsOnlyBits;
564-
if (chipset.majorVersion == 11)
565-
ldsOnlyBits = ldsOnlyBitsGfx11;
566-
else if (chipset.majorVersion == 10)
567-
ldsOnlyBits = ldsOnlyBitsGfx10;
568-
else if (chipset.majorVersion <= 9)
569-
ldsOnlyBits = ldsOnlyBitsGfx6789;
570-
else
571-
return op.emitOpError(
572-
"don't know how to lower this for chipset major version")
573-
<< chipset.majorVersion;
574-
575-
Location loc = op->getLoc();
576-
ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits);
577-
rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
571+
} else if (chipset.majorVersion < 12) {
572+
ROCDL::SBarrierOp::create(rewriter, loc);
578573
} else {
579-
Location loc = op->getLoc();
580-
ROCDL::WaitDscntOp::create(rewriter, loc, 0);
581574
ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
582-
rewriter.replaceOpWithNewOp<ROCDL::BarrierWaitOp>(op, -1);
575+
ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
583576
}
584577

578+
auto acqFence = LLVM::FenceOp::create(rewriter, loc,
579+
LLVM::AtomicOrdering::acquire, scope);
580+
acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
581+
rewriter.replaceOp(op, acqFence);
585582
return success();
586583
}
587584
};

mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
// Note: #gpu.address_space<global> is hardcoded to `1` here because the
99
// test pass doesn't set up the GPU address space conversions.
1010

11+
// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
12+
1113
#gpu_global_addrspace = 1
1214

1315
// CHECK-LABEL: func @fat_raw_buffer_cast
@@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v
414416

415417
// CHECK-LABEL: func @lds_barrier
416418
func.func @lds_barrier() {
419+
// CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
417420
// GFX908: llvm.inline_asm has_side_effects asm_dialect = att
418-
// GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
419-
// GFX90A: rocdl.s.waitcnt -7937
421+
// GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
420422
// GFX90A-NEXT: rocdl.s.barrier
421-
// GFX942: rocdl.s.waitcnt -7937
422423
// GFX942-NEXT: rocdl.s.barrier
423-
// GFX10: rocdl.s.waitcnt -16129
424424
// GFX10-NEXT: rocdl.s.barrier
425-
// GFX11: llvm.inline_asm has_side_effects asm_dialect = att
426-
// GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
427-
// GFX12: rocdl.s.wait.dscnt 0
425+
// GFX11-NEXT: rocdl.s.barrier
428426
// GFX12-NEXT: rocdl.s.barrier.signal -1
429427
// GFX12-NEXT: rocdl.s.barrier.wait -1
428+
// CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
430429
amdgpu.lds_barrier
431430
func.return
432431
}

0 commit comments

Comments
 (0)