
Commit 7025305 (parent 4c2175f)
[AMD][GLUON] Wait outstanding async commit groups instead of instructions (#8605)
Currently, `async_wait` in Gluon on CDNA4 requires the kernel writer to pass the number of outstanding hardware instructions/LLVM intrinsics to `async_wait`. This count is very difficult to compute, as it depends on layouts, sizes, contiguity, and so on. This PR changes the semantics of `async_wait` so that the argument counts outstanding commit groups instead, following the semantics used for NVIDIA in Gluon. Gluon kernels therefore need to commit outstanding async operations via `commit_group` and then wait on them via `wait_group`. I also adapted the names so that existing Gluon kernels using the old semantics error out.

`UpdateAsyncWaitCount` is extended to compute the number of outstanding hardware instructions based on the number of outstanding commit groups. Previously, it only worked on `async_wait` ops carrying tokens of the commit groups, which are not available when compiling a Gluon kernel. The new computation walks the IR backwards, following *all* possible control-flow paths, and finds the smallest number of emitted instructions for N outstanding commit groups.
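As described above, a kernel now pairs its async copies with an explicit commit and waits on groups rather than raw instructions. A minimal sketch of the new call sequence (kernel boilerplate is elided; `smem`, `offsets`, and `blocked` are assumed to be set up as in the updated test below):

```python
# Sketch only: issue async copies, seal them into a commit group, then wait
# until no commit groups remain in flight before reading shared memory.
cdna4_async_copy.global_load_to_shared(smem, a_ptr + offsets)
cdna4_async_copy.commit_group()  # finalize the copies issued so far
cdna4_async_copy.wait_group(0)   # block until 0 groups are outstanding
a = cdna4_async_copy.load_shared_relaxed(smem, blocked)
```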
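A pseudocode sketch of that backward walk (illustrative only, not the actual `UpdateAsyncWaitCount` implementation; `backward_paths`, `is_commit_group`, `is_async_copy`, and `instructions_emitted` are hypothetical helpers):

```python
def hardware_wait_count(wait_op, num_outstanding):
    # wait_group(N) may leave the N newest commit groups in flight, so the
    # hardware wait count is the number of async instructions those groups
    # emitted. That count differs per control-flow path (layouts, sizes,
    # contiguity), so take the minimum over all paths: a smaller count only
    # makes the wait stricter, never unsafe.
    best = float("inf")
    for path in backward_paths(wait_op):  # ops along one path, newest first
        groups_passed, count = 0, 0
        for op in path:
            if is_commit_group(op):
                groups_passed += 1
                if groups_passed > num_outstanding:
                    break  # everything older than this group must complete
            elif is_async_copy(op) and groups_passed > 0:
                # Copies not yet committed contribute nothing, so wait_group(0)
                # still waits for them, matching the docstring below.
                count += instructions_emitted(op)
        best = min(best, count)
    return best
```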

File tree

6 files changed: +836 -96 lines changed

python/test/gluon/test_core.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -541,8 +541,9 @@ def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
         cdna4_async_copy.buffer_load_to_shared(smem, a_ptr, offsets)
     else:
         cdna4_async_copy.global_load_to_shared(smem, a_ptr + offsets)
+    cdna4_async_copy.commit_group()
 
-    cdna4_async_copy.async_wait(0)
+    cdna4_async_copy.wait_group(0)
     a = cdna4_async_copy.load_shared_relaxed(smem, blocked)
 
     ttgl.store(b_ptr + offsets, a)
```

python/test/gluon/test_frontend.py

Lines changed: 23 additions & 4 deletions

```diff
@@ -1950,17 +1950,36 @@ def test_infer_layout_for_amd_wmma(target):
 
 
 @gluon.jit
-def amd_async_wait():
-    cdna4_async_copy.async_wait(0)
+def amd_commit_group():
+    cdna4_async_copy.commit_group()
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
+def test_amd_commit_group(target):
+    mod = run_parser(amd_wait_group, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @amd_wait_group() attributes {noinline = false} {
+    %0 = ttg.async_wait {num = 0 : i32}
+    tt.return
+  }
+}
+""")
+
+
+@gluon.jit
+def amd_wait_group():
+    cdna4_async_copy.wait_group(0)
 
 
 @pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
 def test_amd_async_wait(target):
-    mod = run_parser(amd_async_wait, target=target)
+    mod = run_parser(amd_wait_group, target=target)
     expecttest.assert_expected_inline(
         anonymize_ir(mod.str_nodebug()), """\
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @amd_async_wait() attributes {noinline = false} {
+  tt.func public @amd_wait_group() attributes {noinline = false} {
     %0 = ttg.async_wait {num = 0 : i32}
     tt.return
   }
```

python/triton/experimental/gluon/language/amd/cdna4/async_copy.py

Lines changed: 25 additions & 10 deletions

```diff
@@ -6,7 +6,8 @@
 __all__ = [
     "global_load_to_shared",
     "buffer_load_to_shared",
-    "async_wait",
+    "commit_group",
+    "wait_group",
     "load_shared_relaxed",
 ]
 
@@ -17,7 +18,10 @@ def global_load_to_shared(dest, ptr, mask=None, other=None, cache_modifier="", _
     AMD global load to shared operation. This operation loads data directly
     from global memory to shared memory without going through registers. It
     happens asynchronously and requires a subsequent `async_wait` to ensure the
-    data is available in shared memory.
+    data is available in shared memory. Note that this operation does still
+    complete in order with ttgl.loads/stores or buffer_loads/stores on CDNA4,
+    so interleaving with them will hurt performance.
+
     Compared to `buffer_load_to_shared`, it requires a tensor pointer which
     supports 64-bit indexing range for each thread in a block, which gives more
     flexibility, but at the cost of higher register pressure and no hardware
@@ -72,7 +76,10 @@ def buffer_load_to_shared(dest, ptr, offsets, mask=None, other=None, cache_modif
     32-bit offsets instead of a tensor of pointers. This operation loads data
     directly from global memory to shared memory without going through
     registers. It happens asynchronously and requires a subsequent `async_wait`
-    to ensure the data is available in shared memory.
+    to ensure the data is available in shared memory. Note that this operation
+    does still complete in order with ttgl.loads/stores or buffer_loads/stores
+    on CDNA4, so interleaving with them will hurt performance.
+
     Compared to `global_load_to_shared`, it has better performance and also
     supports hardware out-of-bound masking. But it strictly requires a
     32-bit offset instead of a 64-bit tensor pointer.
@@ -118,16 +125,24 @@ def buffer_load_to_shared(dest, ptr, offsets, mask=None, other=None, cache_modif
 
 
 @builtin
-def async_wait(num_outstanding=0, _semantic=None):
+def commit_group(_semantic=None):
+    """
+    Commit outstanding async operations.
+
+    This finalizes a set of async copy operations which can be waited upon via `wait_group`.
+    """
+    _semantic.builder.create_async_commit_group()
+
+
+@builtin
+def wait_group(num_outstanding=0, _semantic=None):
     """
-    Wait for outstanding memory operations, this includes normal load like
-    `load` and `buffer_load`, as well as direct load to shared memory
-    like `global_load_to_shared` and `buffer_load_to_shared`.
-    It will block until the number of outstanding memory operations is less than
-    or equal to `num_outstanding`.
+    Wait for outstanding commit groups. It will block until the number of
+    outstanding commit groups is less than or equal to `num_outstanding`. Note that
+    uncommitted async operations will be waited upon even if `num_outstanding` is 0.
 
     Args:
-        num_outstanding (int): The number of outstanding operations to wait for. Defaults to 0.
+        num_outstanding (int): The number of outstanding commit groups to wait for. Defaults to 0.
     """
     num_outstanding = _unwrap_if_constexpr(num_outstanding)
     _semantic.builder.create_async_wait_group(num_outstanding)
```
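The `num_outstanding` argument is what makes software pipelining expressible. A hedged sketch of how two groups interact (the buffers `smem_a`/`smem_b`, the offsets, and `blocked` are hypothetical names, not taken from this diff):

```python
cdna4_async_copy.buffer_load_to_shared(smem_a, ptr, offs_a)
cdna4_async_copy.commit_group()  # group 1
cdna4_async_copy.buffer_load_to_shared(smem_b, ptr, offs_b)
cdna4_async_copy.commit_group()  # group 2
# Returns once at most one group is still in flight: group 1 is then
# guaranteed complete, while group 2 may still be filling smem_b.
cdna4_async_copy.wait_group(1)
a = cdna4_async_copy.load_shared_relaxed(smem_a, blocked)
```

Because these Gluon ops carry no tokens, `UpdateAsyncWaitCount` later translates the group count into a hardware instruction count as sketched in the commit message above.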
