tile-ai
diff --git a/‎_sources/autoapi/tilelang/env/index.rst.txt‎
Lines changed: 12 additions & 0 deletions b/‎_sources/autoapi/tilelang/env/index.rst.txt‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/mma_layout/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/mma_layout/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/utils/index.rst.txt‎
Lines changed: 3 additions & 0 deletions b/‎_sources/autoapi/tilelang/intrinsics/utils/index.rst.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎_sources/autoapi/tilelang/intrinsics/wgmma_macro_generator/index.rst.txt‎
Lines changed: 2 additions & 2 deletions b/‎_sources/autoapi/tilelang/intrinsics/wgmma_macro_generator/index.rst.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎_sources/autoapi/tilelang/language/builtin/index.rst.txt‎
Lines changed: 7 additions & 3 deletions b/‎_sources/autoapi/tilelang/language/builtin/index.rst.txt‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎_sources/autoapi/tilelang/language/gemm/index.rst.txt‎
Lines changed: 2 additions & 62 deletions b/‎_sources/autoapi/tilelang/language/gemm/index.rst.txt‎
Lines changed: 2 additions & 62 deletions
diff --git a/‎_sources/autoapi/tilelang/layout/swizzle/index.rst.txt‎
Lines changed: 4 additions & 4 deletions b/‎_sources/autoapi/tilelang/layout/swizzle/index.rst.txt‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎_sources/autoapi/tilelang/tileop/gemm/gemm_base/index.rst.txt‎
Lines changed: 31 additions & 9 deletions b/‎_sources/autoapi/tilelang/tileop/gemm/gemm_base/index.rst.txt‎
Lines changed: 31 additions & 9 deletions
diff --git a/‎_sources/autoapi/tilelang/tileop/gemm/index.rst.txt‎
Lines changed: 18 additions & 40 deletions b/‎_sources/autoapi/tilelang/tileop/gemm/index.rst.txt‎
Lines changed: 18 additions & 40 deletions
@@ -250,6 +250,9 @@ Module Contents
    .. py:attribute:: TILELANG_CLEAR_CACHE
 
 
+   .. py:attribute:: TILELANG_USE_GEMM_V1
+
+
    .. py:attribute:: TILELANG_AUTO_TUNING_CPU_UTILITIES
 
 
@@ -277,6 +280,15 @@ Module Contents
    .. py:method:: is_print_on_compilation_enabled()
 
 
+   .. py:method:: use_gemm_v1()
+
+      Return True if GEMM v1 should be used, as selected by the environment.
+
+      Controlled by ``TILELANG_USE_GEMM_V1``. The value is treated as true when it
+      is one of {"1", "true", "yes", "on"} (case-insensitive).
+
+
+
 .. py:data:: env
 
 .. py:data:: CUDA_HOME
 
@@ -35,6 +35,7 @@ Functions
    tilelang.intrinsics.mma_layout.ldmatrix_32x16_to_shared_16x32_layout_a
    tilelang.intrinsics.mma_layout.ldmatrix_32x16_to_shared_16x32_layout_b
    tilelang.intrinsics.mma_layout.mma_store_32x8_to_shared_16x16_layout
+   tilelang.intrinsics.mma_layout.mma_store_32x2_to_shared_8x8_layout_fp64
    tilelang.intrinsics.mma_layout.shared_16x8_to_mma_a_32x4_layout
    tilelang.intrinsics.mma_layout.shared_16x8_to_mma_a_32x4_layout_trans
    tilelang.intrinsics.mma_layout.shared_16x8_to_mma_b_32x4_layout
@@ -76,6 +77,8 @@ Module Contents
 
 .. py:function:: mma_store_32x8_to_shared_16x16_layout(thread_id, local_id)
 
+.. py:function:: mma_store_32x2_to_shared_8x8_layout_fp64(thread_id, local_id)
+
 .. py:function:: shared_16x8_to_mma_a_32x4_layout(i, j)
 
 .. py:function:: shared_16x8_to_mma_a_32x4_layout_trans(i, j)
 
@@ -14,6 +14,7 @@ Functions
    tilelang.intrinsics.utils.shared_16x32_to_mma_32x16_layout
    tilelang.intrinsics.utils.shared_32x16_to_mma_32x16_layout
    tilelang.intrinsics.utils.mma_store_index_map
+   tilelang.intrinsics.utils.mma_store_index_map_fp64
    tilelang.intrinsics.utils.mfma_store_index_map
    tilelang.intrinsics.utils.get_mma_micro_size
 
@@ -31,6 +32,8 @@ Module Contents
 
 .. py:function:: mma_store_index_map(thread_id, local_id)
 
+.. py:function:: mma_store_index_map_fp64(thread_id, local_id)
+
 .. py:function:: mfma_store_index_map(thread_id, local_id)
 
 .. py:function:: get_mma_micro_size(dtype)
 
@@ -104,10 +104,10 @@ Module Contents
 
 
 
-   .. py:method:: wgmma(A_buf, B_buf, C_local_buf, clear_accum = False, wg_wait = 0)
+   .. py:method:: wgmma(A_region, B_region, C_region, clear_accum = False, wg_wait = 0)
 
 
-   .. py:method:: wgmma_rs(A_buf, B_buf, C_local_buf, clear_accum = False, wg_wait = 0)
+   .. py:method:: wgmma_rs(A_region, B_region, C_region, clear_accum = False, wg_wait = 0)
 
 
    .. py:method:: make_mma_load_layout(local_buf, matrix = 'A')
 
@@ -383,15 +383,19 @@ Module Contents
    This prevents NVCC from sinking uses of accumulator fragments past the corresponding
    WGMMA operations by issuing an empty inline assembly barrier on every register.
 
-   :param buffer_or_ptr: Buffer | PrimExpr
-                         Either a buffer representing the accumulator fragment or a pointer expression.
+   :param buffer_or_ptr: Buffer | BufferLoad | BufferRegion | PrimExpr
+                         A buffer representing the accumulator fragment, a buffer load/region
+                         that identifies a starting element within the fragment, or a pointer expression
+                         (e.g., tvm_access_ptr/address_of/typed Var).
    :param offset: int | PrimExpr
                   Element offset from the start of the accumulator fragment.
    :param num_regs: int | PrimExpr | None
                     Number of 32-bit registers to fence. If None and a Buffer is provided, it will be
                     derived from the buffer shape and dtype.
    :param dtype: str | None
-                 Data type string of the accumulator elements. Required when passing a pointer.
+                 Data type string of the accumulator elements. When passing a buffer or
+                 buffer-derived expression, dtype is inferred. It is required only when
+                 passing a raw pointer expression that cannot be inferred.
 
    :returns: A handle to the warpgroup fence operation.
    :rtype: tir.Call
 
@@ -31,72 +31,12 @@ Module Contents
 
 .. py:function:: gemm_v1(A, B, C, transpose_A = False, transpose_B = False, policy = GemmWarpPolicy.Square, clear_accum = False, k_pack = 1, wg_wait = 0, mbar = None)
 
-   Perform a General Matrix Multiplication (GEMM) operation.
-
-   This function computes C = A @ B where A and B can optionally be transposed.
-   The operation supports various warp policies and accumulation modes.
-
-   :param A: First input matrix
-   :type A: Union[tir.Buffer, tir.Var]
-   :param B: Second input matrix
-   :type B: Union[tir.Buffer, tir.Var]
-   :param C: Output matrix for results
-   :type C: Union[tir.Buffer, tir.Var]
-   :param transpose_A: Whether to transpose matrix A. Defaults to False.
-   :type transpose_A: bool, optional
-   :param transpose_B: Whether to transpose matrix B. Defaults to False.
-   :type transpose_B: bool, optional
-   :param policy: Warp execution policy. Defaults to GemmWarpPolicy.Square.
-   :type policy: GemmWarpPolicy, optional
-   :param clear_accum: Whether to clear accumulator before computation. Defaults to False.
-   :type clear_accum: bool, optional
-   :param k_pack: Number of k dimensions packed into a single warp. Defaults to 1.
-   :type k_pack: int, optional
-   :param wg_wait: Warp group wait count. Defaults to 0.
-                   On hopper it is equivalent to `wgmma.wait_group.sync.aligned <wg_wait>` if wg_wait is not -1
-                   On sm100, `wg_wait` can only be 0 or -1. `mbarrier_wait(TCGEN5MMA barrier)` will be appended if wg_wait is 0.
-   :type wg_wait: int, optional
-   :param mbar: mbarrier for TCGEN5MMA synchronization
-   :type mbar: tir.Buffer, optional
-
-   :returns: A handle to the GEMM operation
-   :rtype: tir.Call
-
-   :raises AssertionError: If the K dimensions of matrices A and B don't match
+   GEMM v1: uses the ``tl.gemm`` op.
 
 
 .. py:function:: gemm_v2(A, B, C, transpose_A = False, transpose_B = False, policy = GemmWarpPolicy.Square, clear_accum = False, k_pack = 1, wg_wait = 0, mbar = None)
 
-   Perform a General Matrix Multiplication (GEMM) operation.
-
-   This function computes C = A @ B where A and B can optionally be transposed.
-   The operation supports various warp policies and accumulation modes.
-
-   :param A: First input matrix
-   :type A: Union[tir.Buffer, tir.Var]
-   :param B: Second input matrix
-   :type B: Union[tir.Buffer, tir.Var]
-   :param C: Output matrix for results
-   :type C: Union[tir.Buffer, tir.Var]
-   :param transpose_A: Whether to transpose matrix A. Defaults to False.
-   :type transpose_A: bool, optional
-   :param transpose_B: Whether to transpose matrix B. Defaults to False.
-   :type transpose_B: bool, optional
-   :param policy: Warp execution policy. Defaults to GemmWarpPolicy.Square.
-   :type policy: GemmWarpPolicy, optional
-   :param clear_accum: Whether to clear accumulator before computation. Defaults to False.
-   :type clear_accum: bool, optional
-   :param k_pack: Number of k dimensions packed into a single warp. Defaults to 1.
-   :type k_pack: int, optional
-   :param wg_wait: Warp group wait count. Defaults to 0.
-   :type wg_wait: int, optional
-   :param mbar: mbarrier for TCGEN5MMA synchronization
-   :type mbar: tir.Buffer, optional
-
-   :returns: A handle to the GEMM operation
-   :rtype: tir.Call
-
-   :raises AssertionError: If the K dimensions of matrices A and B don't match
+   GEMM v2: uses the ``tl.gemm_py`` op.
 
 
 .. py:data:: gemm
 
@@ -37,7 +37,7 @@ Module Contents
 
 .. py:function:: make_full_bank_swizzled_layout(*args)
 
-   :param args: buffer or (stride, continuous, element_size)
+   :param args: buffer/BufferLoad/BufferRegion or (stride, continuous, element_size)
 
    .. rubric:: Examples
 
@@ -47,7 +47,7 @@ Module Contents
 
 .. py:function:: make_half_bank_swizzled_layout(*args)
 
-   :param args: buffer or (stride, continuous, element_size)
+   :param args: buffer/BufferLoad/BufferRegion or (stride, continuous, element_size)
 
    .. rubric:: Examples
 
@@ -57,7 +57,7 @@ Module Contents
 
 .. py:function:: make_quarter_bank_swizzled_layout(*args)
 
-   :param args: buffer or (stride, continuous, element_size)
+   :param args: buffer/BufferLoad/BufferRegion or (stride, continuous, element_size)
 
    .. rubric:: Examples
 
@@ -67,7 +67,7 @@ Module Contents
 
 .. py:function:: make_linear_layout(*args)
 
-   :param args: buffer or (stride, continuous)
+   :param args: buffer/BufferLoad/BufferRegion or (stride, continuous)
 
    .. rubric:: Examples
 
 
@@ -98,19 +98,13 @@ Module Contents
 
 
 
-   .. py:property:: APtr
-      :type: tvm.tir.PrimExpr
+   .. py:property:: ARegion
 
 
+   .. py:property:: BRegion
 
-   .. py:property:: BPtr
-      :type: tvm.tir.PrimExpr
-
-
-
-   .. py:property:: CPtr
-      :type: tvm.tir.PrimExpr
 
+   .. py:property:: CRegion
 
 
    .. py:property:: stride_A
@@ -161,3 +155,31 @@ Module Contents
    .. py:property:: C_coords
 
 
+   .. py:method:: get_region_base_offsets(region)
+
+      Get the base offset (start index) for each dimension from a BufferRegion.
+
+      For example, if region is ``A_shared[ko % 2, 0:128, 0:64]``,
+      this returns ``[ko % 2, 0, 0]``.
+
+      :param region: BufferRegion object
+
+      :returns: List of PrimExpr representing the base offset for each dimension
+
+
+
+   .. py:property:: A_base_offsets
+
+      Get the base offsets for each dimension of the A region.
+
+
+   .. py:property:: B_base_offsets
+
+      Get the base offsets for each dimension of the B region.
+
+
+   .. py:property:: C_base_offsets
+
+      Get the base offsets for each dimension of the C region.
+
+
@@ -94,80 +94,58 @@ Package Contents
    Bases: :py:obj:`tvm.ir.base.Node`, :py:obj:`tvm.runtime.Scriptable`
 
 
-   .. py:attribute:: A
-      :type:  tvm.tir.Buffer
+   .. py:property:: A
 
 
-   .. py:attribute:: B
-      :type:  tvm.tir.Buffer
+   .. py:property:: B
 
 
-   .. py:attribute:: C
-      :type:  tvm.tir.Buffer
+   .. py:property:: C
 
 
-   .. py:attribute:: APtr
-      :type:  tvm.tir.PrimExpr
+   .. py:property:: APtr
 
 
-   .. py:attribute:: BPtr
-      :type:  tvm.tir.PrimExpr
+   .. py:property:: BPtr
 
 
-   .. py:attribute:: CPtr
-      :type:  tvm.tir.PrimExpr
+   .. py:property:: CPtr
 
 
-   .. py:attribute:: M
-      :type:  int
+   .. py:property:: M
 
 
-   .. py:attribute:: N
-      :type:  int
+   .. py:property:: N
 
 
-   .. py:attribute:: K
-      :type:  int
+   .. py:property:: K
 
 
-   .. py:attribute:: trans_A
-      :type:  bool
+   .. py:property:: trans_A
 
 
-   .. py:attribute:: trans_B
-      :type:  bool
+   .. py:property:: trans_B
 
 
-   .. py:attribute:: stride_A
-      :type:  int
+   .. py:property:: stride_A
 
 
-   .. py:attribute:: stride_B
-      :type:  int
+   .. py:property:: stride_B
 
 
-   .. py:attribute:: offset_A
-      :type:  int
+   .. py:property:: offset_A
 
 
-   .. py:attribute:: offset_B
-      :type:  int
+   .. py:property:: offset_B
 
 
-   .. py:attribute:: clear_accum
-      :type:  bool
+   .. py:property:: clear_accum
 
 
-   .. py:attribute:: k_pack
-      :type:  int
+   .. py:property:: k_pack
 
 
-   .. py:attribute:: wg_wait
-      :type:  int
-
-
-   .. py:attribute:: policy
-      :type:  tilelang.ir.GemmWarpPolicy
+   .. py:property:: wg_wait
 
 
    .. py:method:: infer_layout(target, thread_nums)