
Commit d0c65f9

[Gluon] Implement TensorDescriptor kernel arguments (#7142)
1 parent 3597ff1 commit d0c65f9

15 files changed, +207 -25 lines

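Taken together, the change lets a host-side TensorDescriptor carry an explicit NVMMASharedLayout and be passed straight into a @gluon.jit kernel. A minimal sketch of the resulting user-facing flow, condensed from the new test in test_core.py below (the kernel name is illustrative, and the NVMMASharedLayout call relies on the defaults exercised in test_frontend.py):

    import torch
    from triton.experimental import gluon
    from triton.experimental.gluon import language as ttgl
    from triton.experimental.gluon.language.nvidia.hopper import tma


    @gluon.jit
    def zero_tile_kernel(desc):  # illustrative name; mirrors tma_kernel in test_core.py
        # Block shape, element type, and shared-memory layout all travel with
        # the descriptor argument instead of being passed as separate constexprs.
        reg_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 2], [4, 8], [4, 1], [1, 0])
        value = ttgl.full(desc.block_shape, 0, desc.dtype, reg_layout)
        smem = ttgl.allocate_shared_memory(desc.dtype, desc.block_shape, desc.layout, value)
        tma.async_copy_shared_to_global(desc, [0, 0], smem)
        tma.store_wait(0)
        smem._keep_alive()  # keep the allocation alive until the TMA store completes


    # Host side (requires a Hopper-class GPU):
    out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
    layout = ttgl.NVMMASharedLayout(swizzle_byte_width=32, element_bitwidth=16, rank=2)
    desc = gluon.nvidia.hopper.TensorDescriptor.from_tensor(out, [16, 16], layout)
    zero_tile_kernel[(1, )](desc)  # out becomes all zeros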

python/src/gluon_ir.cc

Lines changed: 9 additions & 0 deletions
@@ -270,6 +270,15 @@ void init_gluon_ir(py::module &&m) {
              assert(ty.getEncoding());
              return layoutToGluon(ty.getEncoding());
            })
+      .def("get_tensor_descriptor_layout_type",
+           [](GluonOpBuilder &self, Type blockType, bool isSigned,
+              Attribute layout) -> Type {
+             auto ctx = self.getContext();
+             auto blockTy = cast<RankedTensorType>(blockType);
+             auto blockTyLayout = RankedTensorType::get(
+                 blockTy.getShape(), blockTy.getElementType(), layout);
+             return triton::TensorDescType::get(ctx, blockTyLayout, isSigned);
+           })
       .def("create_convert_layout",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::ConvertLayoutOp>(resultTy, value);
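This binding is the C++ half of the feature: it rebuilds the block's RankedTensorType with the supplied Gluon layout attribute and wraps it in a TensorDescType, so descriptor kernel arguments lower to a layout-annotated !tt.tensordesc<tensor<..., #shared>> instead of the previous bare !tt.tensordesc<tensor<...>>. The updated expected IR in test_frontend.py below shows exactly this change in the function signatures.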

python/test/gluon/test_core.py

Lines changed: 28 additions & 0 deletions
@@ -1,8 +1,10 @@
 import torch
 import pytest
 
+from triton._internal_testing import is_cuda
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
+from triton.experimental.gluon.language.nvidia.hopper import tma
 
 
 @gluon.jit
@@ -31,3 +33,29 @@ def test_copy_kernel(layout, XBLOCK):
 
     copy_kernel[(4, )](out, inp, inp.numel(), XBLOCK, layout, num_warps=layout.warps_per_cta[0])
     torch.testing.assert_close(out, inp)
+
+
+@gluon.jit
+def tma_kernel(desc):
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 2], [4, 8], [4, 1], [1, 0])
+    value = ttgl.full(desc.block_shape, 0, desc.dtype, layout)
+    alloc = ttgl.allocate_shared_memory(desc.dtype, desc.block_shape, desc.layout, value)
+    tma.async_copy_shared_to_global(desc, [0, 0], alloc)
+    tma.store_wait(0)
+    alloc._keep_alive()
+
+
+@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires Hopper")
+def test_tma():
+    out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
+    layout = ttgl.NVMMASharedLayout(
+        swizzle_byte_width=32,
+        element_bitwidth=16,
+        rank=2,
+        transposed=False,
+        fp4_padded=False,
+    )
+
+    desc = gluon.nvidia.hopper.TensorDescriptor.from_tensor(out, [16, 16], layout)
+    tma_kernel[(1, )](desc)
+    torch.testing.assert_close(out, torch.zeros_like(out))
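The new test round-trips the descriptor through the kernel ABI: the kernel builds a zero tile in registers, stages it in shared memory using the descriptor's own layout, and TMA-stores it over a tensor of ones, so the final assert only passes if the block shape, dtype, and layout all arrive intact on the device side.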

python/test/gluon/test_frontend.py

Lines changed: 15 additions & 15 deletions
@@ -9,10 +9,10 @@
 from triton.experimental.gluon import language as ttgl
 from triton.experimental.gluon.language.nvidia import blackwell
 from triton.experimental.gluon.language.nvidia.blackwell import mbarrier, tma, TensorMemoryLayout
+from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
 from triton._filecheck import filecheck_test, run_parser
 import triton.language as tl
 from triton._internal_testing import is_cuda
-from triton.tools.tensor_descriptor import TensorDescriptor
 from triton.compiler.errors import CompilationError
 
 TARGET_PAT = re.compile('ttg.target = "[^"]*"')
@@ -434,8 +434,8 @@ def test_tcgen05_mma(fresh_knobs):
 
 
 @gluon.jit
-def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, smem_layout: ttgl.constexpr):
-    smem = ttgl.allocate_shared_memory(ttgl.float16, [XBLOCK, XBLOCK], smem_layout)
+def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr):
+    smem = ttgl.allocate_shared_memory(ttgl.float16, [XBLOCK, XBLOCK], input_desc.layout)
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
     mbarrier.init(bar, count=1)
 
@@ -455,25 +455,25 @@ def test_async_tma(fresh_knobs):
 
     input = torch.randn((1024, 1024), device="cuda", dtype=torch.float16)
     XBLOCK = 128
-    input_desc = TensorDescriptor.from_tensor(input, [XBLOCK, XBLOCK])
     shared_layout = ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2)
+    input_desc = TensorDescriptor.from_tensor(input, [XBLOCK, XBLOCK], shared_layout)
 
-    h = async_tma_kernel.warmup(input_desc, XBLOCK, shared_layout, grid=(1, ), num_warps=4)
+    h = async_tma_kernel.warmup(input_desc, XBLOCK, grid=(1, ), num_warps=4)
     expecttest.assert_expected_inline(
         anonymize_ir(h.asm["source"]), """\
 #loc = loc(unknown)
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @async_tma_kernel(%arg0: !tt.tensordesc<tensor<128x128xf16>> loc(unknown), %arg1: i32 loc(unknown), %arg2: i32 loc(unknown), %arg3: i64 loc(unknown), %arg4: i64 loc(unknown)) attributes {noinline = false} {
+  tt.func public @async_tma_kernel(%arg0: !tt.tensordesc<tensor<128x128xf16, #shared>> loc(unknown), %arg1: i32 loc(unknown), %arg2: i32 loc(unknown), %arg3: i64 loc(unknown), %arg4: i64 loc(unknown)) attributes {noinline = false} {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
     %1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
     ttng.init_barrier %1, 1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
     %c0_i32 = arith.constant 0 : i32 loc(#loc)
     %c0_i32_0 = arith.constant 0 : i32 loc(#loc)
     %true = arith.constant true loc(#loc)
-    ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32_0] %0, %1, %true : !tt.tensordesc<tensor<128x128xf16>>, !ttg.memdesc<1xi64, #shared1, #smem, mutable> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
+    ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32_0] %0, %1, %true : !tt.tensordesc<tensor<128x128xf16, #shared>>, !ttg.memdesc<1xi64, #shared1, #smem, mutable> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
     %true_1 = arith.constant true loc(#loc)
     ttng.barrier_expect %1, 32768, %true_1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
     %c0_i32_2 = arith.constant 0 : i32 loc(#loc)
@@ -482,7 +482,7 @@ def test_async_tma(fresh_knobs):
     ttng.inval_barrier %1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
     %c0_i32_4 = arith.constant 0 : i32 loc(#loc)
     %c0_i32_5 = arith.constant 0 : i32 loc(#loc)
-    ttng.async_tma_copy_local_to_global %arg0[%c0_i32_4, %c0_i32_5] %0 : !tt.tensordesc<tensor<128x128xf16>>, !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
+    ttng.async_tma_copy_local_to_global %arg0[%c0_i32_4, %c0_i32_5] %0 : !tt.tensordesc<tensor<128x128xf16, #shared>>, !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
     ttng.async_tma_store_wait {pendings = 0 : i32} loc(#loc)
     tt.return loc(#loc)
   } loc(#loc)
@@ -491,8 +491,8 @@
 
 
 @gluon.jit
-def async_tma_blackwell_kernel(input_desc, XBLOCK: ttgl.constexpr, smem_layout: ttgl.constexpr):
-    smem = ttgl.allocate_shared_memory(ttgl.float16, [XBLOCK, XBLOCK], smem_layout)
+def async_tma_blackwell_kernel(input_desc, XBLOCK: ttgl.constexpr):
+    smem = ttgl.allocate_shared_memory(ttgl.float16, [XBLOCK, XBLOCK], input_desc.layout)
     bar = ttgl.allocate_shared_memory(ttgl.int64, [1], mbarrier.MBarrierLayout())
     mbarrier.init(bar, count=1)
 
@@ -514,10 +514,10 @@ def test_async_tma_blackwell(fresh_knobs):
 
     input = torch.randn((1024, 1024), device="cuda", dtype=torch.float16)
     XBLOCK = 128
-    input_desc = TensorDescriptor.from_tensor(input, [1, XBLOCK])
     shared_layout = ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2)
+    input_desc = TensorDescriptor.from_tensor(input, [1, XBLOCK], shared_layout)
 
-    h = async_tma_blackwell_kernel.warmup(input_desc, XBLOCK, shared_layout, grid=(1, ), num_warps=4)
+    h = async_tma_blackwell_kernel.warmup(input_desc, XBLOCK, grid=(1, ), num_warps=4)
     expecttest.assert_expected_inline(
         anonymize_ir(h.asm["source"]), """\
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0]}>
@@ -526,22 +526,22 @@ def test_async_tma_blackwell(fresh_knobs):
 #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @async_tma_blackwell_kernel(%arg0: !tt.tensordesc<tensor<1x128xf16>> loc(unknown), %arg1: i32 loc(unknown), %arg2: i32 loc(unknown), %arg3: i64 loc(unknown), %arg4: i64 loc(unknown)) attributes {noinline = false} {
+  tt.func public @async_tma_blackwell_kernel(%arg0: !tt.tensordesc<tensor<1x128xf16, #shared>> loc(unknown), %arg1: i32 loc(unknown), %arg2: i32 loc(unknown), %arg3: i64 loc(unknown), %arg4: i64 loc(unknown)) attributes {noinline = false} {
    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
    %1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
    ttng.init_barrier %1, 1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
    %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>> loc(#loc)
    %true = arith.constant true loc(#loc)
    %c0_i32 = arith.constant 0 : i32 loc(#loc)
-    ttng.async_tma_gather %arg0[%2, %c0_i32] %0, %1, %true : !tt.tensordesc<tensor<1x128xf16>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<1xi64, #shared1, #smem, mutable>, !ttg.memdesc<128x128xf16, #shared, #smem, mutable>, i1 loc(#loc)
+    ttng.async_tma_gather %arg0[%2, %c0_i32] %0, %1, %true : !tt.tensordesc<tensor<1x128xf16, #shared>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<1xi64, #shared1, #smem, mutable>, !ttg.memdesc<128x128xf16, #shared, #smem, mutable>, i1 loc(#loc)
    %true_0 = arith.constant true loc(#loc)
    ttng.barrier_expect %1, 32768, %true_0 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
    %c0_i32_1 = arith.constant 0 : i32 loc(#loc)
    %true_2 = arith.constant true loc(#loc)
    ttng.wait_barrier %1, %c0_i32_1, %true_2 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
    ttng.inval_barrier %1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> loc(#loc)
    %c0_i32_3 = arith.constant 0 : i32 loc(#loc)
-    ttng.async_tma_scatter %arg0[%2, %c0_i32_3] %0 : !tt.tensordesc<tensor<1x128xf16>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
+    ttng.async_tma_scatter %arg0[%2, %c0_i32_3] %0 : !tt.tensordesc<tensor<1x128xf16, #shared>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked}>>, i32, !ttg.memdesc<128x128xf16, #shared, #smem, mutable> loc(#loc)
    ttng.async_tma_store_wait {pendings = 0 : i32} loc(#loc)
    tt.return loc(#loc)
  } loc(#loc)
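The updates here are the observable frontend effect of the change: the shared-memory layout is no longer threaded through as a separate smem_layout constexpr; the kernels read it from input_desc.layout, the warmup calls drop one argument, and every !tt.tensordesc in the expected IR gains the #shared encoding.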
Lines changed: 2 additions & 1 deletion

@@ -1,3 +1,4 @@
+from . import nvidia
 from ._runtime import jit
 
-__all__ = ["jit"]
+__all__ = ["jit", "nvidia"]

python/triton/experimental/gluon/language/_core.py

Lines changed: 4 additions & 1 deletion
@@ -16,7 +16,6 @@
     dtype,
     block_type,  # TODO: block type with layout info
     pointer_type,
-    tuple_type,
     void,
     int1,
     int8,
@@ -39,6 +38,8 @@
     _unwrap_if_constexpr,
     _unwrap_shape,
     tensor,
+    tuple,
+    tuple_type,
 )
 
 _IMPORT_FROM_TRITON: List[str] = [
@@ -88,6 +89,8 @@
     "float64",
     "_unwrap_if_constexpr",
     "tensor",
+    "tuple",
+    "tuple_type",
     "arange",
     "full",
     "convert_layout",

python/triton/experimental/gluon/language/_standard.py

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 import triton
 import triton.language.standard as tl_standard
 from .._runtime import jit
+from triton import knobs
 
 _IMPORT_FROM_TRITON = [
     "sum",
@@ -16,5 +17,5 @@
 for name in _IMPORT_FROM_TRITON:
     # Convert JITFunction -> GluonJitFunction
     fn = getattr(tl_standard, name)
-    assert isinstance(fn, triton.runtime.JITFunction)
+    assert knobs.runtime.interpret or isinstance(fn, triton.runtime.JITFunction)
     globals()[name] = jit(fn.fn)
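This tweak looks incidental to the feature: under the interpreter knob (knobs.runtime.interpret) the triton.language.standard helpers are presumably wrapped in a different callable type than JITFunction, so the assert is loosened rather than failing at import time in that mode.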

python/triton/experimental/gluon/language/nvidia/blackwell/tma.py

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,8 @@
     async_copy_global_to_shared,
     async_copy_shared_to_global,
     store_wait,
+    tensor_descriptor,
+    tensor_descriptor_type,
 )
 
 
@@ -11,6 +13,8 @@
     "async_copy_global_to_shared",
     "async_copy_shared_to_global",
     "store_wait",
+    "tensor_descriptor",
+    "tensor_descriptor_type",
 ]
 
 
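The Blackwell tma module simply re-exports the Hopper tensor_descriptor and tensor_descriptor_type, so the same descriptor arguments work on both targets.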
python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 73 additions & 0 deletions
@@ -1,8 +1,81 @@
+from __future__ import annotations
+from typing import List, Tuple, TYPE_CHECKING
+from dataclasses import dataclass
+import triton.experimental.gluon.language._core as ttgl
+from triton.experimental.gluon.language._layouts import NVMMASharedLayout
 from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
 
+if TYPE_CHECKING:
+    from triton._C import ir
+
 __all__ = ["async_copy_global_to_shared", "async_copy_shared_to_global", "store_wait"]
 
 
+@dataclass(eq=True)
+class tensor_descriptor_type:
+    block_type: ttgl.block_type
+    shape_type: ttgl.tuple_type
+    strides_type: ttgl.tuple_type
+    layout: NVMMASharedLayout
+
+    def __str__(self) -> str:
+        return f"tensor_descriptor<{self.block_type}, {self.layout}>"
+
+    def _unflatten_ir(self, handles: List[ir.value], cursor: int) -> Tuple[tensor_descriptor, int]:
+        handle = handles[cursor]
+        cursor += 1
+        shape, cursor = self.shape_type._unflatten_ir(handles, cursor)
+        strides, cursor = self.strides_type._unflatten_ir(handles, cursor)
+        value = tensor_descriptor(handle, shape, strides, self.block_type, layout=self.layout)
+        return value, cursor
+
+    def _flatten_ir_types(self, builder: ir.builder, out: List[ir.type]) -> None:
+        is_signed = self.block_type.element_ty.is_int_signed()
+        ty = builder.get_tensor_descriptor_layout_type(
+            self.block_type.to_ir(builder),
+            is_signed,
+            self.layout._to_ir(builder),
+        )
+        out.append(ty)
+        self.shape_type._flatten_ir_types(builder, out)
+        self.strides_type._flatten_ir_types(builder, out)
+
+    def mangle(self) -> str:
+        return f"TD{self.block_type.mangle}_{self.layout.mangle()}TD"
+
+
+class tensor_descriptor:
+
+    def __init__(self, handle, shape: List[ttgl.tensor], strides: List[ttgl.tensor], block_type: ttgl.block_type,
+                 layout: NVMMASharedLayout):
+        self.handle = handle
+        self.shape = ttgl.tuple(shape)
+        self.strides = ttgl.tuple(strides)
+        self.type = tensor_descriptor_type(block_type, shape_type=self.shape.type, strides_type=self.strides.type,
+                                           layout=layout)
+
+    def _flatten_ir(self, handles: List[ir.value]) -> None:
+        handles.append(self.handle)
+        self.shape._flatten_ir(handles)
+        self.strides._flatten_ir(handles)
+
+    @property
+    def block_type(self):
+        return self.type.block_type
+
+    @property
+    def block_shape(self):
+        return self.type.block_type.shape
+
+    @property
+    def dtype(self):
+        return self.type.block_type.element_ty
+
+    @property
+    def layout(self):
+        return self.type.layout
+
+
 @builtin
 def async_copy_global_to_shared(tensor_desc, coord, barrier, result, pred=True, _semantic=None):
     coord = _semantic._convert_to_ir_values(coord, require_i64=False)
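This file carries the heart of the Python side. tensor_descriptor participates in Gluon's argument flattening protocol: _flatten_ir pushes one TMA handle followed by the shape and stride scalars, and _unflatten_ir reassembles them with a moving cursor. That is why the 2-D descriptors in the expected IR above expand to five kernel arguments (the !tt.tensordesc handle, two i32 shape values, and two i64 strides), and _flatten_ir_types is the call site for the new get_tensor_descriptor_layout_type binding from gluon_ir.cc.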
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+from . import hopper
+from . import blackwell
+
+__all__ = ["hopper", "blackwell"]
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+from .hopper import TensorDescriptor
+
+__all__ = ["TensorDescriptor"]
