Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 30 additions & 33 deletions mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -536,52 +536,49 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
/// Lowers `amdgpu.lds_barrier` to a workgroup barrier bracketed by a
/// release fence and an acquire fence.
///
/// Both fences use the "workgroup" sync scope and carry an
/// `amdgpu-synchronize-as:local` MMRA tag. No explicit waitcnt-style op is
/// created here; per the note below, the fences are what is intended to
/// produce the required waits.
///
/// The barrier itself is emitted as:
///   * inline assembly (`s_barrier`) on chipsets older than gfx90a,
///   * `rocdl.s.barrier` on the remaining chipsets with major version < 12,
///   * `rocdl.s.barrier.signal -1` + `rocdl.s.barrier.wait -1` otherwise.
///
/// \param op       The barrier op being lowered; replaced by the acquire fence.
/// \param adaptor  Unused (the op has no operands that are read here).
/// \param rewriter Rewriter used to create the new ops and replace `op`.
/// \returns success() unconditionally.
LogicalResult
matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
                ConversionPatternRewriter &rewriter) const override {
  Location loc = op.getLoc();
  // This ensures that waits on global memory aren't introduced on
  // chips that don't have the BackOffBarrier feature enabled in LLVM.
  bool requiresInlineAsm = chipset < kGfx90a;

  Attribute mmra =
      rewriter.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
  // Note: while there *is* a workgroup-one-as scope, this, when combined with
  // the MMRA, will lead to the fence having no effect. This is because the
  // codepaths for an atomic load or store will observe that a
  // one-address-space atomic to LDS requires no synchronization because
  // operations on LDS are totally ordered with respect to each other, and so
  // will not emit the correct waitcnt operations that these fences are
  // intended to produce. Therefore, we use a broader type of fence and rely
  // on the MMRA to relax it to the semantics we want.
  StringRef scope = "workgroup";

  // Release fence ahead of the barrier.
  auto relFence = LLVM::FenceOp::create(rewriter, loc,
                                        LLVM::AtomicOrdering::release, scope);
  relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);

  if (requiresInlineAsm) {
    auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
                                                    LLVM::AsmDialect::AD_ATT);
    const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier";
    const char *constraints = "";
    // Created in place (not via replaceOpWithNewOp): `op` must stay alive
    // until the acquire fence below replaces it.
    LLVM::InlineAsmOp::create(
        rewriter, loc,
        /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
        /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
        /*is_align_stack=*/false, LLVM::TailCallKind::None,
        /*asm_dialect=*/asmDialectAttr,
        /*operand_attrs=*/ArrayAttr());
  } else if (chipset.majorVersion < 12) {
    ROCDL::SBarrierOp::create(rewriter, loc);
  } else {
    ROCDL::BarrierSignalOp::create(rewriter, loc, -1);
    ROCDL::BarrierWaitOp::create(rewriter, loc, -1);
  }

  // Acquire fence after the barrier; it is the single replacement for `op`
  // on every path.
  auto acqFence = LLVM::FenceOp::create(rewriter, loc,
                                        LLVM::AtomicOrdering::acquire, scope);
  acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra);
  rewriter.replaceOp(op, acqFence);
  return success();
}
};
Expand Down
13 changes: 6 additions & 7 deletions mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
// Note: #gpu.address_space<global> is hardcoded to `1` here because the
// test pass doesn't set up the GPU address space conversions.

// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">

#gpu_global_addrspace = 1

// CHECK-LABEL: func @fat_raw_buffer_cast
Expand Down Expand Up @@ -414,19 +416,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v

// The barrier is expected to be bracketed by a release fence and an acquire
// fence on every target, both tagged with the LDS-only MMRA captured at the
// top of this file. Only the barrier instruction itself differs per chipset.
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
  // CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]}
  // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
  // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier"
  // GFX90A-NEXT: rocdl.s.barrier
  // GFX942-NEXT: rocdl.s.barrier
  // GFX10-NEXT: rocdl.s.barrier
  // GFX11-NEXT: rocdl.s.barrier
  // GFX12-NEXT: rocdl.s.barrier.signal -1
  // GFX12-NEXT: rocdl.s.barrier.wait -1
  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]}
  amdgpu.lds_barrier
  func.return
}
Expand Down