tile-ai
diff --git a/‎_sources/autoapi/tilelang/carver/arch/cdna/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/carver/arch/cdna/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/carver/arch/cuda/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/carver/arch/cuda/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/carver/arch/metal/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/carver/arch/metal/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/carver/roller/policy/tensorcore/index.rst.txt‎
Lines changed: 56 additions & 1 deletion b/‎_sources/autoapi/tilelang/carver/roller/policy/tensorcore/index.rst.txt‎
Lines changed: 56 additions & 1 deletion
diff --git a/‎_sources/autoapi/tilelang/carver/template/flashattention/index.rst.txt‎
Lines changed: 5 additions & 0 deletions b/‎_sources/autoapi/tilelang/carver/template/flashattention/index.rst.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/index.rst.txt‎
Lines changed: 1 addition & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/index.rst.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/tcgen05_macro_generator/index.rst.txt‎
Lines changed: 141 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/tcgen05_macro_generator/index.rst.txt‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/jit/adapter/dlpack/index.rst.txt‎
Lines changed: 5 additions & 1 deletion b/‎_sources/autoapi/tilelang/jit/adapter/dlpack/index.rst.txt‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎_sources/autoapi/tilelang/jit/kernel/index.rst.txt‎
Lines changed: 2 additions & 0 deletions b/‎_sources/autoapi/tilelang/jit/kernel/index.rst.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/language/allocate/index.rst.txt‎
Lines changed: 26 additions & 2 deletions b/‎_sources/autoapi/tilelang/language/allocate/index.rst.txt‎
Lines changed: 26 additions & 2 deletions
@@ -30,6 +30,9 @@ Module Contents
    Bases: :py:obj:`tilelang.carver.arch.arch_base.TileDevice`
 
 
+   Represents the architecture of a computing device, capturing various hardware specifications.
+
+
    .. py:attribute:: target
 
 
 
@@ -48,6 +48,9 @@ Module Contents
    Bases: :py:obj:`tilelang.carver.arch.arch_base.TileDevice`
 
 
+   Represents the architecture of a computing device, capturing various hardware specifications.
+
+
    .. py:attribute:: target
 
 
 
@@ -30,6 +30,9 @@ Module Contents
    Bases: :py:obj:`tilelang.carver.arch.arch_base.TileDevice`
 
 
+   Represents the architecture of a computing device, capturing various hardware specifications.
+
+
    .. py:attribute:: target
 
 
@@ -30,11 +30,15 @@ Module Contents
 
 .. py:data:: logger
 
-.. py:class:: TensorCorePolicy
+.. py:class:: TensorCorePolicy(arch, tags = None)
 
    Bases: :py:obj:`tilelang.carver.roller.policy.default.DefaultPolicy`
 
 
+   Default Policy for fastdlight, a heuristic plan that tries to
+   minimize memory traffic and maximize parallelism for BitBLAS Schedule.
+
+
    .. py:attribute:: wmma_k
       :type:  int
       :value: 16
@@ -61,16 +65,67 @@ Module Contents
 
    .. py:method:: infer_node_smem_usage(td, node)
 
+      Infers the shared memory usage of a node given a TileDict configuration.
+
+      :param td: The TileDict object containing the tile configuration.
+      :type td: TileDict
+      :param node: The node for which to infer the shared memory usage.
+      :type node: PrimFuncNode
+
+      :returns: The estimated amount of shared memory used by the node.
+      :rtype: int
+
+
 
    .. py:method:: get_node_reduce_step_candidates(node)
 
+      Calculates reduction step candidates for each reduction axis in a PrimFuncNode. General idea: use factors first, since they do not require an extra boundary check. For a large prime number, which is a rare case, use powers of 2.
+
+      :param node: The node for which to calculate reduction step candidates. It contains reduction axes (raxis)
+                   with their domains (dom.extent).
+      :type node: PrimFuncNode
+
+      :returns: A dictionary mapping axis variable names to lists of step candidates. For each axis in the node,
+                this function calculates possible step sizes. For axes with a large prime domain, it uses powers of 2
+                as step candidates; for others, it uses all factors of the domain.
+      :rtype: Dict[str, List[int]]
+
+
 
    .. py:method:: check_tile_shape_isvalid(td)
 
+      Checks if the tile shapes in the TileDict are valid for the nodes in this context.
+
+      :param td: The TileDict object containing tile shapes and other configurations.
+      :type td: TileDict
+
+      :returns: True if all tile shapes are valid, False otherwise.
+      :rtype: bool
+
+
 
    .. py:method:: compute_node_stride_map(node, td)
 
+      Computes the stride map for a given node based on the TileDict configuration.
+
+      :param node: The node for which to compute the stride map.
+      :type node: PrimFuncNode
+      :param td: The TileDict object containing the tile configuration.
+      :type td: TileDict
+
+      :returns: A tuple of dictionaries containing the output strides and tensor strides.
+      :rtype: Tuple[Dict, Dict]
+
+
 
    .. py:method:: plan_rasterization(td)
 
+      Plans the rasterization for the given TileDict. This function is not implemented yet.
+
+      :param td: The TileDict object to plan rasterization for.
+      :type td: TileDict
+
+      :raises RasterRationPlan: This function is not implemented yet.
+
+
 
@@ -20,6 +20,11 @@ Module Contents
    Bases: :py:obj:`tilelang.carver.template.base.BaseTemplate`
 
 
+   Base class template for hardware-aware configurations.
+   This serves as an abstract base class (ABC) that defines the structure
+   for subclasses implementing hardware-specific optimizations.
+
+
    .. py:attribute:: batch_size
       :type:  int
       :value: 1
 
@@ -14,6 +14,7 @@ Submodules
    /autoapi/tilelang/intrinsics/mfma_macro_generator/index
    /autoapi/tilelang/intrinsics/mma_layout/index
    /autoapi/tilelang/intrinsics/mma_macro_generator/index
+   /autoapi/tilelang/intrinsics/tcgen05_macro_generator/index
    /autoapi/tilelang/intrinsics/utils/index
    /autoapi/tilelang/intrinsics/wgmma_macro_generator/index
 
 
@@ -0,0 +1,141 @@
+tilelang.intrinsics.tcgen05_macro_generator
+===========================================
+
+.. py:module:: tilelang.intrinsics.tcgen05_macro_generator
+
+
+Attributes
+----------
+
+.. autoapisummary::
+
+   tilelang.intrinsics.tcgen05_macro_generator.lift
+
+
+Classes
+-------
+
+.. autoapisummary::
+
+   tilelang.intrinsics.tcgen05_macro_generator.SwizzleMode
+   tilelang.intrinsics.tcgen05_macro_generator.TensorCoreIntrinEmitter
+
+
+Module Contents
+---------------
+
+.. py:data:: lift
+
+.. py:class:: SwizzleMode
+
+   Bases: :py:obj:`enum.IntEnum`
+
+
+   Enum where members are also (and must be) ints
+
+
+   .. py:attribute:: NONE
+      :value: 0
+
+
+
+   .. py:attribute:: SWIZZLE_128B
+      :value: 2
+
+
+
+   .. py:attribute:: SWIZZLE_64B
+      :value: 4
+
+
+
+   .. py:attribute:: SWIZZLE_32B
+      :value: 6
+
+
+
+   .. py:method:: is_none()
+
+
+   .. py:method:: is_swizzle_32b()
+
+
+   .. py:method:: is_swizzle_64b()
+
+
+   .. py:method:: is_swizzle_128b()
+
+
+   .. py:method:: swizzle_byte_size()
+
+
+   .. py:method:: swizzle_atom_size()
+
+
+.. py:class:: TensorCoreIntrinEmitter(a_dtype = 'float16', b_dtype = 'float16', accum_dtype = 'float16', a_transposed = False, b_transposed = False, block_row_warps = 2, block_col_warps = 2, warp_row_tiles = 8, warp_col_tiles = 8, chunk = 16, reduce_k = 1, num_elems_per_byte = 1, is_m_first = False, thread_var = None)
+
+   Bases: :py:obj:`tilelang.intrinsics.mma_macro_generator.TensorCoreIntrinEmitter`
+
+
+   To eliminate Python syntax within TIR Macro.
+
+
+   .. py:attribute:: tcgen05_prefix
+      :type:  str
+
+
+   .. py:attribute:: a_shared_layout
+      :type:  tilelang.layout.Layout
+      :value: None
+
+
+
+   .. py:attribute:: b_shared_layout
+      :type:  tilelang.layout.Layout
+      :value: None
+
+
+
+   .. py:method:: tcgen05mma(A_buf, B_buf, C_local_buf, mbar, clear_accum = False)
+
+
+   .. py:method:: make_mma_load_layout(local_buf, matrix = 'A')
+      :abstractmethod:
+
+
+      Create a layout function for storing MMA results into a fragment buffer.
+      This layout is used in conjunction with `inverse_mma_store_layout` to
+      map fragment indices to threads and local indices.
+
+      :param local_buf: The local buffer representing a fragment of a matrix.
+      :type local_buf: tir.Buffer
+
+      :returns: A fragment object that describes how threads and indices
+                in `local_buf` are laid out.
+      :rtype: T.Fragment
+
+      :raises AssertionError: If `local_buf` is not detected to be a fragment buffer.
+
+
+
+   .. py:method:: make_mma_store_layout(tmem_buf)
+
+      Create the TCGEN5 tensor-memory layout used to store MMA accumulators.
+
+      :param tmem_buf: The local buffer representing the tensor memory of an MMA's output.
+      :type tmem_buf: tir.Buffer
+
+      :returns: Layout object describing how logical (i, j) coordinates map to the
+                swizzled tensor-memory offsets required by TCGEN5MMA.
+      :rtype: Layout
+
+      :raises AssertionError: If `tmem_buf` is not detected to be a tensor-memory buffer.
+
+
+
+   .. py:method:: get_tcgen5_mma_meta(m, n, k)
+
+
+   .. py:method:: get_tcgen5_instr_desc(atom_m, atom_n, atom_k, a_is_k_major, b_is_k_major, scale_in_a, scale_in_b)
+
+
@@ -20,8 +20,12 @@ Classes
 Module Contents
 ---------------
 
-.. py:class:: TorchDLPackKernelAdapter
+.. py:class:: TorchDLPackKernelAdapter(mod, params, result_idx)
 
    Bases: :py:obj:`tilelang.jit.adapter.base.BaseKernelAdapter`
 
 
+   Helper class that provides a standard way to create an ABC using
+   inheritance.
+
+
@@ -116,6 +116,8 @@ Module Contents
 
 
    .. py:attribute:: target
+      :value: 'auto'
+
 
 
    .. py:method:: from_database(func, kernel_global_source, kernel_lib_path, params, target, target_host, out_idx, execution_backend, pass_configs = None, compile_flags = None)
 
@@ -22,6 +22,14 @@ tilelang.language.allocate
 
 
 
+Attributes
+----------
+
+.. autoapisummary::
+
+   tilelang.language.allocate.DescKind
+
+
 Functions
 ---------
 
@@ -35,6 +43,10 @@ Functions
    tilelang.language.allocate.alloc_tmem
    tilelang.language.allocate.alloc_reducer
    tilelang.language.allocate.alloc_descriptor
+   tilelang.language.allocate.alloc_wgmma_desc
+   tilelang.language.allocate.alloc_tcgen05_smem_desc
+   tilelang.language.allocate.alloc_tcgen05_instruction_desc
+   tilelang.language.allocate.alloc_tcgen05_instr_desc
 
 
 Module Contents
@@ -181,11 +193,23 @@ Module Contents
    :rtype: T.Buffer
 
 
-.. py:function:: alloc_descriptor(dtype='uint64', scope='local.descriptor')
+.. py:data:: DescKind
+
+.. py:function:: alloc_descriptor(kind = 'wgmma', dtype = 'uint64')
 
-   Allocate a descriptor buffer for wgmma and utcmma.
+   Allocate a descriptor buffer for WGMMA and TCGEN5.MMA.
+
+   :param kind: The descriptor kind, one of "wgmma", "tcgen05" ("utcmma" as alias).
 
    :returns: A TVM buffer object allocated as a descriptor
    :rtype: T.Buffer
 
 
+.. py:function:: alloc_wgmma_desc(dtype = 'uint64')
+
+.. py:function:: alloc_tcgen05_smem_desc(dtype = 'uint64')
+
+.. py:function:: alloc_tcgen05_instruction_desc(dtype = 'uint32')
+
+.. py:function:: alloc_tcgen05_instr_desc(dtype = 'uint32')
+