Skip to content

Commit 3e14134

Browse files
authored
[Gluon] Add thread_barrier and {hopper,blackwell}.fence_async_shared (#7152)
1 parent 4390874 commit 3e14134

File tree

4 files changed

+35
-4
lines changed

4 files changed

+35
-4
lines changed

python/test/gluon/test_frontend.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,3 +908,20 @@ def test_zeros():
908908

909909
# CHECK: arith.constant dense<7> : tensor<8x8xi16, [[BLOCKED2D]]>
910910
ttgl.full_like(a, 7, shape=[8, 8], dtype=ttgl.int16, layout=layout_2d)
911+
912+
913+
# Filecheck-based frontend test: the `# CHECK:` directives below are matched
# against the IR produced when the jitted kernel is traced.
@filecheck_test
@gluon.jit
def test_barrier():
    # thread_barrier lowers to a gpu.barrier op (asserted by the directive).
    # CHECK: gpu.barrier
    ttgl.thread_barrier()
918+
919+
920+
# Filecheck-based frontend test for the shared-memory async fence builtin.
@filecheck_test
@gluon.jit
def test_fence_async_shared():
    # Default call: fence scoped to the CTA (bCluster = false).
    # CHECK: ttng.fence_async_shared {bCluster = false}
    blackwell.fence_async_shared()

    # cluster=True widens the fence scope to the cluster (bCluster = true).
    # CHECK-NEXT: ttng.fence_async_shared {bCluster = true}
    blackwell.fence_async_shared(cluster=True)

python/triton/experimental/gluon/language/_core.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
"tensor",
9292
"tuple",
9393
"tuple_type",
94+
"thread_barrier",
9495
"arange",
9596
"full",
9697
"convert_layout",
@@ -313,3 +314,8 @@ def warp_specialize(args, default_partition, worker_partitions, worker_num_warps
313314
worker_num_regs = [_unwrap_if_constexpr(r) for r in worker_num_regs]
314315
return _semantic.warp_specialize(args, default_partition, worker_partitions, worker_num_warps, #
315316
worker_num_regs, _generator)
317+
318+
319+
@builtin
def thread_barrier(_semantic=None):
    """Synchronize all threads in the program.

    Thin builtin wrapper that delegates to the semantic layer's
    ``debug_barrier``; lowers to ``gpu.barrier`` (see the frontend
    filecheck test for this builtin).
    """
    return _semantic.debug_barrier()

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,19 @@
66
from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
77

88
from . import tma
9-
from ..hopper import mbarrier
9+
from ..hopper import mbarrier, fence_async_shared
1010

1111
if TYPE_CHECKING:
1212
from triton._C.libtriton.gluon_ir import GluonOpBuilder
1313
from triton._C.libtriton import gluon_ir as ir
1414
from ..._semantic import GluonSemantic
1515

1616
__all__ = [
17-
"TensorMemoryLayout",
18-
"tensor_memory_descriptor",
1917
"allocate_tensor_memory",
18+
"fence_async_shared",
2019
"mbarrier",
20+
"tensor_memory_descriptor",
21+
"TensorMemoryLayout",
2122
"tma",
2223
]
2324

python/triton/experimental/gluon/language/nvidia/hopper/__init__.py (filename missing from the scrape; inferred from the `fence_async_shared` definition re-exported by blackwell via `from ..hopper import mbarrier, fence_async_shared`)

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
from . import mbarrier
22
from . import tma
3+
from ... import _core
34

4-
__all__ = ["mbarrier", "tma"]
5+
__all__ = ["fence_async_shared", "mbarrier", "tma"]
6+
7+
8+
@_core.builtin
def fence_async_shared(cluster=False, _semantic=None):
    """Emit a fence ordering asynchronous shared-memory accesses.

    Args:
        cluster: when True, the fence is emitted with cluster scope
            (``bCluster = true``); defaults to CTA scope. May be a
            ``constexpr`` — it is unwrapped before use.
    """
    scope_is_cluster = _core._unwrap_if_constexpr(cluster)
    _semantic.builder.create_fence_async_shared(scope_is_cluster)

0 commit comments

Comments
 (0)