Skip to content

Commit d146804

Browse files
schwarzschild-radiuskcloudy0717
authored andcommitted
[mlir][NVVM] Add support for few more fence Ops (llvm#170251)
This commit adds support for the following fence Ops: - fence.sync_restrict - fence.proxy.sync_restrict The commit also moves memory.barrier into the Membar/Fence section, migrates fence.mbarrier.init to intrinsics and consolidates fence related tests under nvvm/fence.mlir and nvvm/fence-invalid.mlir
1 parent 536847e commit d146804

File tree

8 files changed

+334
-110
lines changed

8 files changed

+334
-110
lines changed

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Lines changed: 106 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,27 @@ def SharedSpaceAttr : EnumAttr<NVVM_Dialect, SharedSpace, "shared_space"> {
211211
let assemblyFormat = "`<` $value `>`";
212212
}
213213

214+
// Attrs describing the Memory Ordering Semantics
215+
def MemOrderKindWeak : I32EnumAttrCase<"WEAK", 0, "weak">;
216+
def MemOrderKindRelaxed : I32EnumAttrCase<"RELAXED", 1, "relaxed">;
217+
def MemOrderKindAcquire : I32EnumAttrCase<"ACQUIRE", 2, "acquire">;
218+
def MemOrderKindRelease : I32EnumAttrCase<"RELEASE", 3, "release">;
219+
def MemOrderKindAcqRel : I32EnumAttrCase<"ACQ_REL", 4, "acq_rel">;
220+
def MemOrderKindSC : I32EnumAttrCase<"SC", 5, "sc">;
221+
def MemOrderKindMMIO : I32EnumAttrCase<"MMIO", 6, "mmio">;
222+
def MemOrderKindVolatile : I32EnumAttrCase<"VOLATILE", 7, "volatile">;
223+
224+
def MemOrderKind : I32EnumAttr<"MemOrderKind", "NVVM Memory Ordering kind",
225+
[MemOrderKindWeak, MemOrderKindRelaxed, MemOrderKindAcquire,
226+
MemOrderKindRelease, MemOrderKindAcqRel, MemOrderKindSC,
227+
MemOrderKindMMIO, MemOrderKindVolatile]> {
228+
let genSpecializedAttr = 0;
229+
let cppNamespace = "::mlir::NVVM";
230+
}
231+
def MemOrderKindAttr : EnumAttr<NVVM_Dialect, MemOrderKind, "mem_order"> {
232+
let assemblyFormat = "`<` $value `>`";
233+
}
234+
214235
//===----------------------------------------------------------------------===//
215236
// NVVM intrinsic operations
216237
//===----------------------------------------------------------------------===//
@@ -1369,13 +1390,66 @@ def NVVM_ClusterWaitOp : NVVM_Op<"cluster.wait", [NVVMRequiresSM<90>]> {
13691390
let assemblyFormat = "attr-dict";
13701391
}
13711392

1393+
//===----------------------------------------------------------------------===//
1394+
// NVVM Member/Fence
1395+
//===----------------------------------------------------------------------===//
1396+
1397+
def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
1398+
Arguments<(ins MemScopeKindAttr:$scope)> {
1399+
let summary = "Memory barrier operation";
1400+
let description = [{
1401+
`membar` operation guarantees that prior memory accesses requested by this
1402+
thread are performed at the specified `scope`, before later memory
1403+
operations requested by this thread following the membar instruction.
1404+
1405+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
1406+
}];
1407+
1408+
let assemblyFormat = "$scope attr-dict";
1409+
let llvmBuilder = [{
1410+
createIntrinsicCall(builder, getMembarIntrinsicID($scope));
1411+
}];
1412+
}
1413+
13721414
def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> {
13731415
string llvmBuilder = [{
13741416
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_sc_cluster);
13751417
}];
13761418
let assemblyFormat = "attr-dict";
13771419
}
13781420

1421+
def NVVM_FenceSyncRestrictOp : NVVM_Op<"fence.sync_restrict">,
1422+
Arguments<(ins MemOrderKindAttr:$order)> {
1423+
let summary = "Uni-directional thread fence operation";
1424+
let description = [{
1425+
The `nvvm.fence.sync_restrict` Op restricts the class of memory
1426+
operations for which the fence instruction provides the memory ordering guarantees.
1427+
`sync_restrict` restricts `acquire` memory semantics to `shared_cluster` and
1428+
`release` memory semantics to `shared_cta` with cluster scope.
1429+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
1430+
}];
1431+
1432+
let assemblyFormat = "attr-dict";
1433+
let llvmBuilder = [{
1434+
createIntrinsicCall(builder, getFenceSyncRestrictID($order));
1435+
}];
1436+
1437+
let hasVerifier = 1;
1438+
}
1439+
1440+
def NVVM_FenceMbarrierInitOp : NVVM_Op<"fence.mbarrier.init"> {
1441+
let description = [{
1442+
Fence operation that applies on the prior nvvm.mbarrier.init
1443+
1444+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
1445+
}];
1446+
1447+
let assemblyFormat = "attr-dict";
1448+
let llvmBuilder = [{
1449+
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_fence_mbarrier_init_release_cluster);
1450+
}];
1451+
}
1452+
13791453
def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
13801454
def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
13811455
def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
@@ -1389,10 +1463,15 @@ def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
13891463
}
13901464

13911465
def ProxyKindAttr : EnumAttr<NVVM_Dialect, ProxyKind, "proxy_kind"> {
1466+
let description = [{
1467+
ProxyKind attribute represents a memory proxy which is an abstract label
1468+
applied to a method of memory access. When two memory operations use distinct
1469+
methods of memory access, they are said to be different proxies.
1470+
}];
13921471
let assemblyFormat = "`<` $value `>`";
13931472
}
13941473

1395-
def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
1474+
def NVVM_FenceProxyOp : NVVM_Op<"fence.proxy">,
13961475
Arguments<(ins ProxyKindAttr:$kind,
13971476
OptionalAttr<SharedSpaceAttr>:$space)> {
13981477
let description = [{
@@ -1403,16 +1482,11 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
14031482
}];
14041483

14051484
let assemblyFormat = "attr-dict";
1406-
let extraClassDefinition = [{
1407-
std::string $cppClass::getPtx() {
1408-
std::string ptx = "fence.proxy.";
1409-
ptx += stringifyProxyKind(getKind());
1410-
if(getKind() == NVVM::ProxyKind::async_shared)
1411-
{ ptx += "::"; ptx += stringifySharedSpace(getSpace().value()); }
1412-
ptx += ";";
1413-
return ptx;
1414-
}
1485+
1486+
let llvmBuilder = [{
1487+
createIntrinsicCall(builder, getFenceProxyID($kind, $space));
14151488
}];
1489+
14161490
let hasVerifier = 1;
14171491
}
14181492

@@ -1449,23 +1523,6 @@ def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
14491523
let hasVerifier = 1;
14501524
}
14511525

1452-
def NVVM_MembarOp : NVVM_Op<"memory.barrier">,
1453-
Arguments<(ins MemScopeKindAttr:$scope)> {
1454-
let summary = "Memory barrier operation";
1455-
let description = [{
1456-
`membar` operation guarantees that prior memory accesses requested by this
1457-
thread are performed at the specified `scope`, before later memory
1458-
operations requested by this thread following the membar instruction.
1459-
1460-
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar)
1461-
}];
1462-
1463-
let assemblyFormat = "$scope attr-dict";
1464-
let llvmBuilder = [{
1465-
createIntrinsicCall(builder, getMembarIntrinsicID($scope), {});
1466-
}];
1467-
}
1468-
14691526
def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
14701527
Arguments<(ins MemScopeKindAttr:$scope,
14711528
DefaultValuedAttr<ProxyKindAttr,
@@ -1492,6 +1549,28 @@ def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
14921549
let hasVerifier = 1;
14931550
}
14941551

1552+
def NVVM_FenceProxySyncRestrictOp : NVVM_Op<"fence.proxy.sync_restrict">,
1553+
Arguments<(ins MemOrderKindAttr:$order,
1554+
DefaultValuedAttr<ProxyKindAttr, "ProxyKind::GENERIC">:$fromProxy,
1555+
DefaultValuedAttr<ProxyKindAttr, "ProxyKind::async">:$toProxy)> {
1556+
let summary = "Uni-directional proxy fence operation with sync_restrict";
1557+
let description = [{
1558+
The `nvvm.fence.proxy.sync_restrict` Op used to establish
1559+
ordering between a prior memory access performed between proxies. Currently,
1560+
the ordering is only supported between async and generic proxies. `sync_restrict`
1561+
restricts `acquire` memory semantics to `shared_cluster` and `release` memory
1562+
semantics to `shared_cta` with cluster scope.
1563+
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
1564+
}];
1565+
1566+
let assemblyFormat = "attr-dict";
1567+
let llvmBuilder = [{
1568+
createIntrinsicCall(builder, getFenceProxySyncRestrictID($order));
1569+
}];
1570+
1571+
let hasVerifier = 1;
1572+
}
1573+
14951574
def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
14961575
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
14971576
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
@@ -1514,22 +1593,6 @@ def NVVM_SetMaxRegisterOp : NVVM_Op<"setmaxregister"> {
15141593
}];
15151594
}
15161595

1517-
def NVVM_FenceMbarrierInitOp : NVVM_PTXBuilder_Op<"fence.mbarrier.init"> {
1518-
let arguments = (ins );
1519-
let description = [{
1520-
Fence operation that applies on the prior nvvm.mbarrier.init
1521-
1522-
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
1523-
}];
1524-
1525-
let assemblyFormat = "attr-dict";
1526-
let extraClassDefinition = [{
1527-
std::string $cppClass::getPtx() {
1528-
return std::string("fence.mbarrier_init.release.cluster;");
1529-
}
1530-
}];
1531-
}
1532-
15331596
def ShflKindBfly : I32EnumAttrCase<"bfly", 0>;
15341597
def ShflKindUp : I32EnumAttrCase<"up", 1>;
15351598
def ShflKindDown : I32EnumAttrCase<"down", 2>;

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2098,6 +2098,13 @@ bool NVVM::WgmmaMmaAsyncOp::getAsmValues(
20982098
return true; // Has manual mapping
20992099
}
21002100

2101+
LogicalResult NVVM::FenceSyncRestrictOp::verify() {
2102+
if (getOrder() != NVVM::MemOrderKind::ACQUIRE &&
2103+
getOrder() != NVVM::MemOrderKind::RELEASE)
2104+
return emitOpError("only acquire and release semantics are supported");
2105+
return success();
2106+
}
2107+
21012108
LogicalResult NVVM::FenceProxyOp::verify() {
21022109
if (getKind() == NVVM::ProxyKind::TENSORMAP)
21032110
return emitOpError() << "tensormap proxy is not a supported proxy kind";
@@ -2120,7 +2127,6 @@ LogicalResult NVVM::FenceProxyAcquireOp::verify() {
21202127
if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
21212128
return emitOpError("uni-directional proxies only support tensormap "
21222129
"for to_proxy attribute");
2123-
21242130
return success();
21252131
}
21262132

@@ -2132,7 +2138,19 @@ LogicalResult NVVM::FenceProxyReleaseOp::verify() {
21322138
if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
21332139
return emitOpError("uni-directional proxies only support tensormap "
21342140
"for to_proxy attribute");
2141+
return success();
2142+
}
2143+
2144+
LogicalResult NVVM::FenceProxySyncRestrictOp::verify() {
2145+
if (getOrder() != NVVM::MemOrderKind::ACQUIRE &&
2146+
getOrder() != NVVM::MemOrderKind::RELEASE)
2147+
return emitOpError("only acquire and release semantics are supported");
2148+
2149+
if (getFromProxy() != NVVM::ProxyKind::GENERIC)
2150+
return emitOpError("only generic is support for from_proxy attribute");
21352151

2152+
if (getToProxy() != NVVM::ProxyKind::async)
2153+
return emitOpError("only async is supported for to_proxy attribute");
21362154
return success();
21372155
}
21382156

mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,41 @@ getTcgen05StIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
411411
llvm_unreachable("unhandled tcgen05.st lowering");
412412
}
413413

414+
static llvm::Intrinsic::ID getFenceSyncRestrictID(NVVM::MemOrderKind order) {
415+
return order == NVVM::MemOrderKind::ACQUIRE
416+
? llvm::Intrinsic::
417+
nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster
418+
: llvm::Intrinsic::
419+
nvvm_fence_release_sync_restrict_space_cta_scope_cluster;
420+
}
421+
422+
static llvm::Intrinsic::ID
423+
getFenceProxyID(NVVM::ProxyKind kind, std::optional<NVVM::SharedSpace> space) {
424+
switch (kind) {
425+
case NVVM::ProxyKind::alias:
426+
return llvm::Intrinsic::nvvm_fence_proxy_alias;
427+
case NVVM::ProxyKind::async:
428+
return llvm::Intrinsic::nvvm_fence_proxy_async;
429+
case NVVM::ProxyKind::async_global:
430+
return llvm::Intrinsic::nvvm_fence_proxy_async_global;
431+
case NVVM::ProxyKind::async_shared:
432+
return *space == NVVM::SharedSpace::shared_cta
433+
? llvm::Intrinsic::nvvm_fence_proxy_async_shared_cta
434+
: llvm::Intrinsic::nvvm_fence_proxy_async_shared_cluster;
435+
default:
436+
llvm_unreachable("unsupported proxy kind");
437+
}
438+
}
439+
440+
static llvm::Intrinsic::ID
441+
getFenceProxySyncRestrictID(NVVM::MemOrderKind order) {
442+
return order == NVVM::MemOrderKind::ACQUIRE
443+
? llvm::Intrinsic::
444+
nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster
445+
: llvm::Intrinsic::
446+
nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster;
447+
}
448+
414449
namespace {
415450
/// Implementation of the dialect interface that converts operations belonging
416451
/// to the NVVM dialect to LLVM IR.

mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -584,29 +584,6 @@ func.func @cp_async_bulk_wait_group() {
584584

585585
// -----
586586

587-
func.func @fence_mbarrier_init() {
588-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.mbarrier_init.release.cluster;"
589-
nvvm.fence.mbarrier.init
590-
func.return
591-
}
592-
// -----
593-
594-
func.func @fence_proxy() {
595-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.alias;", "" : () -> ()
596-
nvvm.fence.proxy { kind = #nvvm.proxy_kind<alias>}
597-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async;", "" : () -> ()
598-
nvvm.fence.proxy { kind = #nvvm.proxy_kind<async>}
599-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.global;", "" : () -> ()
600-
nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.global>}
601-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cta;", "" : () -> ()
602-
nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>}
603-
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "fence.proxy.async.shared::cluster;", "" : () -> ()
604-
nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
605-
func.return
606-
}
607-
608-
// -----
609-
610587
// CHECK-LABEL: @llvm_nvvm_barrier_arrive
611588
// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
612589
llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {

mlir/test/Dialect/LLVMIR/nvvm.mlir

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,6 @@ func.func @llvm_nvvm_cluster_wait() {
9292
llvm.return
9393
}
9494

95-
// CHECK-LABEL: @llvm_nvvm_fence_sc_cluster
96-
func.func @llvm_nvvm_fence_sc_cluster() {
97-
// CHECK: nvvm.fence.sc.cluster
98-
nvvm.fence.sc.cluster
99-
llvm.return
100-
}
101-
10295
// CHECK-LABEL: @nvvm_shfl
10396
func.func @nvvm_shfl(
10497
%arg0 : i32, %arg1 : i32, %arg2 : i32,

0 commit comments

Comments
 (0)