 from triton.experimental.gluon.language.nvidia.blackwell import mbarrier, tma, TensorMemoryLayout, async_copy
 from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
 from triton.experimental.gluon.language.amd import _layouts as amd_layouts
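+# CDNA4-specific async copy ops; aliased so they do not shadow the Blackwell async_copy import above.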
+from triton.experimental.gluon.language.amd.cdna4 import async_copy as cdna4_async_copy
 from triton.experimental.gluon.language.extra import libdevice
 
 from triton._filecheck import filecheck_test, run_parser
@@ -1590,7 +1591,175 @@ def test_infer_layout_for_amd_mfma(target):
15901591""" )
15911592
15921593
1593- @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA3 , HIP_TARGET_CDNA4 ])
1594+ @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA4 ])
+def test_amd_load_shared_relaxed(target):
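+    # load_shared_relaxed should lower to a ttg.local_load tagged with
+    # ttg.amdgpu.syncedViaAsyncWait = true, marking the read as synchronized
+    # by an async wait rather than a barrier.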
+
+    @gluon.jit
+    def kernel():
+        blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
+        shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])
+
+        smem = ttgl.allocate_shared_memory(ttgl.float16, [128, 16], shared)
+        cdna4_async_copy.load_shared_relaxed(smem, blocked)
+
+    mod = run_parser(kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel() attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
+    %1 = ttg.local_load %0 {ttg.amdgpu.syncedViaAsyncWait = true} : !ttg.memdesc<128x16xf16, #shared, #smem, mutable> -> tensor<128x16xf16, #blocked>
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
+def test_amd_load_shared_relaxed_in_loop(target):
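+    # Same tagged ttg.local_load as above, but it must be emitted inside the
+    # scf.for body rather than hoisted out.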
+
+    @gluon.jit
+    def kernel():
+        blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
+        shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])
+
+        smem = ttgl.allocate_shared_memory(ttgl.float16, [128, 16], shared)
+        for i in range(10):
+            cdna4_async_copy.load_shared_relaxed(smem, blocked)
+
+    mod = run_parser(kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel() attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
+    %c0_i32 = arith.constant 0 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %1 = arith.bitcast %c0_i32 : i32 to i32
+    %2 = arith.bitcast %c10_i32 : i32 to i32
+    %3 = arith.bitcast %c1_i32 : i32 to i32
+    %4 = ub.poison : i32
+    scf.for %arg0 = %1 to %2 step %3 : i32 {
+      %5 = ttg.local_load %0 {ttg.amdgpu.syncedViaAsyncWait = true} : !ttg.memdesc<128x16xf16, #shared, #smem, mutable> -> tensor<128x16xf16, #blocked>
+    }
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
+def test_amd_global_load_to_shared(target):
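+    # global_load_to_shared plus async_wait should lower to
+    # ttg.async_copy_global_to_local followed by ttg.async_wait {num = 0}.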
+
+    @gluon.jit
+    def kernel(ptr):
+        blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
+        shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])
+
+        smem = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [128, 16], shared)
+        offsets = ttgl.arange(0, 128, layout=ttgl.SliceLayout(1, blocked))[:, None] * 16 + \
+            ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, blocked))[None, :]
+
+        cdna4_async_copy.global_load_to_shared(smem, ptr + offsets)
+        cdna4_async_copy.async_wait(0)
+
+    ptr = MockTensor(ttgl.float16)
+    mod = run_parser(kernel, *make_args(ptr), target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
+    %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked>
+    %c16_i32 = arith.constant 16 : i32
+    %c16_i32_0 = arith.constant 16 : i32
+    %cst = arith.constant dense<16> : tensor<128x1xi32, #blocked>
+    %3 = arith.muli %2, %cst : tensor<128x1xi32, #blocked>
+    %4 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
+    %6 = tt.broadcast %3 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked>
+    %7 = tt.broadcast %5 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked>
+    %8 = arith.addi %6, %7 : tensor<128x16xi32, #blocked>
+    %9 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x16x!tt.ptr<f16>, #blocked>
+    %10 = tt.addptr %9, %8 : tensor<128x16x!tt.ptr<f16>, #blocked>, tensor<128x16xi32, #blocked>
+    %11 = ttg.async_copy_global_to_local %10, %0 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
+    %12 = ttg.async_wait {num = 0 : i32}
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
+def test_amd_global_load_to_shared_with_broadcast(target):
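+    # The rank-reduced mask and scalar other must be broadcast/splat to the
+    # full 128x16 shape before being attached to the async copy.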
+
+    @gluon.jit
+    def kernel(ptr):
+        blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 8], [32, 2], [4, 1], [1, 0])
+        shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])
+
+        smem = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [128, 16], shared)
+        y_offset = ttgl.arange(0, 128, layout=ttgl.SliceLayout(1, blocked))
+        x_offset = ttgl.arange(0, 16, layout=ttgl.SliceLayout(0, blocked))
+        offsets = y_offset[:, None] * 16 + x_offset[None, :]
+
+        mask = (y_offset < 64)[:, None]
+        other = tl.cast(0.0, ptr.dtype.element_ty)
+
+        cdna4_async_copy.global_load_to_shared(smem, ptr + offsets, mask, other)
+        cdna4_async_copy.async_wait(0)
+
+    ptr = MockTensor(ttgl.float16)
+    mod = run_parser(kernel, *make_args(ptr), target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x16xf16, #shared, #smem, mutable>
+    %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %3 = tt.expand_dims %1 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi32, #blocked>
+    %c16_i32 = arith.constant 16 : i32
+    %c16_i32_0 = arith.constant 16 : i32
+    %cst = arith.constant dense<16> : tensor<128x1xi32, #blocked>
+    %4 = arith.muli %3, %cst : tensor<128x1xi32, #blocked>
+    %5 = tt.expand_dims %2 {axis = 0 : i32} : tensor<16xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked>
+    %6 = tt.broadcast %4 : tensor<128x1xi32, #blocked> -> tensor<128x16xi32, #blocked>
+    %7 = tt.broadcast %5 : tensor<1x16xi32, #blocked> -> tensor<128x16xi32, #blocked>
+    %8 = arith.addi %6, %7 : tensor<128x16xi32, #blocked>
+    %c64_i32 = arith.constant 64 : i32
+    %cst_1 = arith.constant dense<64> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %9 = arith.cmpi slt, %1, %cst_1 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<128x1xi1, #blocked>
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %11 = arith.truncf %cst_2 : f32 to f16
+    %12 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x16x!tt.ptr<f16>, #blocked>
+    %13 = tt.addptr %12, %8 : tensor<128x16x!tt.ptr<f16>, #blocked>, tensor<128x16xi32, #blocked>
+    %14 = tt.broadcast %10 : tensor<128x1xi1, #blocked> -> tensor<128x16xi1, #blocked>
+    %15 = tt.splat %11 : f16 -> tensor<128x16xf16, #blocked>
+    %16 = ttg.async_copy_global_to_local %13, %0 mask %14 other %15 : tensor<128x16x!tt.ptr<f16>, #blocked> -> <128x16xf16, #shared, #smem, mutable>
+    %17 = ttg.async_wait {num = 0 : i32}
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
 def test_buffer_load_to_shared(target):
 
     @gluon.jit
@@ -1601,7 +1770,7 @@ def kernel(ptr):
         dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [256], shared)
         offsets = ttgl.arange(0, 256, layout=blocked)
 
-        ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets)
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets)
 
     ptr = MockTensor(ttgl.float32)
     mod = run_parser(kernel, *make_args(ptr), target=target)
@@ -1621,7 +1790,61 @@ def kernel(ptr):
16211790""" )
16221791
16231792
1624- @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA3 , HIP_TARGET_CDNA4 ])
1793+ @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA4 ])
+def test_buffer_load_to_shared_with_broadcast(target):
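+    # Same broadcast handling for buffer_load_to_shared: mask and other are
+    # expanded to 4x64 and passed to amdgpu.buffer_load_to_local.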
+
+    @gluon.jit
+    def kernel(ptr):
+        blocked1: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 64], [4, 1], [1, 0])
+        shared: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[1, 0])
+
+        dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [4, 64], shared)
+
+        y_index = ttgl.arange(0, 4, layout=ttgl.SliceLayout(1, blocked1))
+        x_index = ttgl.arange(0, 64, layout=ttgl.SliceLayout(0, blocked1))
+        offsets = y_index[:, None] * 64 + x_index[None, :]
+
+        mask = (y_index < 2)[:, None]
+        other = 0.0
+
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, mask, other)
+
+    ptr = MockTensor(ttgl.float32)
+    mod = run_parser(kernel, *make_args(ptr), target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(mod.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<4x64xf32, #shared, #smem, mutable>
+    %1 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %3 = tt.expand_dims %1 {axis = 1 : i32} : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi32, #blocked>
+    %c64_i32 = arith.constant 64 : i32
+    %c64_i32_0 = arith.constant 64 : i32
+    %cst = arith.constant dense<64> : tensor<4x1xi32, #blocked>
+    %4 = arith.muli %3, %cst : tensor<4x1xi32, #blocked>
+    %5 = tt.expand_dims %2 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
+    %6 = tt.broadcast %4 : tensor<4x1xi32, #blocked> -> tensor<4x64xi32, #blocked>
+    %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked> -> tensor<4x64xi32, #blocked>
+    %8 = arith.addi %6, %7 : tensor<4x64xi32, #blocked>
+    %c2_i32 = arith.constant 2 : i32
+    %cst_1 = arith.constant dense<2> : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %9 = arith.cmpi slt, %1, %cst_1 : tensor<4xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<4xi1, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<4x1xi1, #blocked>
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %11 = tt.broadcast %10 : tensor<4x1xi1, #blocked> -> tensor<4x64xi1, #blocked>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<4x64xf32, #blocked>
+    %12 = amdgpu.buffer_load_to_local %arg0[%8] mask = %11 other = %cst_3 into %0 : <f32>[tensor<4x64xi32, #blocked>] tensor<4x64xf32, #blocked> -> <4x64xf32, #shared, #smem, mutable>
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA4])
 def test_buffer_load_to_shared_mask_other(target):
 
     @gluon.jit
@@ -1634,7 +1857,7 @@ def kernel(ptr):
 
         mask = ttgl.full([256], 1, ttgl.int1, layout=blocked)
         other = ttgl.full([256], 0, ptr.dtype.element_ty, layout=blocked)
-        ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, mask, other)
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, mask, other)
 
     ptr = MockTensor(ttgl.float32)
     mod = run_parser(kernel, *make_args(ptr), target=target)
@@ -1658,7 +1881,7 @@ def kernel(ptr):
16581881""" )
16591882
16601883
1661- @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA3 , HIP_TARGET_CDNA4 ])
1884+ @pytest .mark .parametrize ("target" , [HIP_TARGET_CDNA4 ])
16621885def test_buffer_load_to_shared_cache_mods (target ):
16631886
16641887 @gluon .jit
@@ -1669,9 +1892,9 @@ def kernel(ptr):
         dest = ttgl.allocate_shared_memory(ptr.dtype.element_ty, [256], shared)
         offsets = ttgl.arange(0, 256, layout=blocked)
 
-        ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".ca")
-        ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cg")
-        ttgl.amd.cdna3.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cv")
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".ca")
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cg")
+        cdna4_async_copy.buffer_load_to_shared(dest, ptr, offsets, cache_modifier=".cv")
 
     ptr = MockTensor(ttgl.float32)
     mod = run_parser(kernel, *make_args(ptr), target=target)