
Commit 3887b80

[TMA] Enable unswizzled tma layouts (#6238)
This enables TMA support for smaller load blocks by falling back to unswizzled shared-memory encodings where necessary. We are also careful to propagate the shape info from gather/scatter instructions so these can still enable swizzling where possible.
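
For context, here is a rough sketch (not taken from this commit) of the kind of small-block descriptor usage this unblocks. It assumes a TMA-capable (Hopper-class) GPU and uses `triton.set_allocator` the same way the accompanying tests do; the kernel and function names (`small_block_copy`, `run`) are hypothetical.

# Hypothetical illustration: a (2, 16) block has fewer than 8 rows, so the
# NVMMA swizzled shared-memory layout cannot be used; the compiler now falls
# back to an unswizzled encoding instead of rejecting the kernel.
import torch
import triton
import triton.language as tl


@triton.jit
def small_block_copy(out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
    desc = tl.make_tensor_descriptor(a_ptr, [M, N], [N, 1], [M_BLOCK, N_BLOCK])
    block = desc.load([0, 0])  # TMA load of an M_BLOCK x N_BLOCK tile
    idx = tl.arange(0, M_BLOCK)[:, None] * N_BLOCK + tl.arange(0, N_BLOCK)[None, :]
    tl.store(out_ptr + idx, block)


def run():
    # Device-side tensor descriptors need a host-side allocator, as in the tests below.
    triton.set_allocator(lambda size, align, stream: torch.empty(size, dtype=torch.int8, device="cuda"))
    M, N, M_BLOCK, N_BLOCK = 32, 128, 2, 16
    a = torch.randn((M, N), device="cuda", dtype=torch.float32)
    out = torch.empty((M_BLOCK, N_BLOCK), device="cuda", dtype=torch.float32)
    small_block_copy[(1, )](out, a, M, N, M_BLOCK, N_BLOCK)
    return out
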
1 parent 188048c

8 files changed: +141, -60 lines

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 1 deletion
@@ -430,7 +430,7 @@ def NVMMASharedEncodingAttr :
       } else if (contigDimSizeInByte >= 32 && contigDimSizeInByte % 32 == 0) {
         swizzlingByteWidth = 32;
       } else {
-        llvm_unreachable("unsupported shared memory layout for MMAv3");
+        llvm_unreachable("unsupported NVMMA layout (MMAv3 or TMA)");
       }
       bool transposed = order[0] == 0;
       return $_get(context, swizzlingByteWidth, transposed, eleBitWidth, fp4Padded, CTALayout);
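
For reference, a small Python sketch of how the swizzle width is derived from the contiguous-dimension size in bytes. Only the 32-byte branch is visible in this hunk; the 128- and 64-byte branches are an assumption inferred from the else-if chain.

def swizzling_byte_width(contig_dim_size_in_bytes: int) -> int:
    # Mirrors the else-if chain above; the 128/64 cases are assumed from context.
    for width in (128, 64, 32):
        if contig_dim_size_in_bytes >= width and contig_dim_size_in_bytes % width == 0:
            return width
    raise ValueError("unsupported NVMMA layout (MMAv3 or TMA)")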

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 8 additions & 3 deletions
@@ -156,11 +156,16 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
           "elem type .b4x16_p64 supports only 128B swizzling");
     }
   } else {
-    op->emitError() << "Unhandled encoding type";
-    return failure();
+    auto swizzledEnc = dyn_cast<gpu::SwizzledSharedEncodingAttr>(
+        op.getType().getBlockType().getEncoding());
+    if (!swizzledEnc || swizzledEnc.getVec() != 1 ||
+        swizzledEnc.getPerPhase() != 1 || swizzledEnc.getMaxPhase() != 1) {
+      op->emitError() << "Unhandled encoding type";
+      return failure();
+    }
   }

-  int32_t swizzle_mode;
+  int32_t swizzle_mode = 0;
   if (swizzleBytes == 128) {
     swizzle_mode = 3;
   } else if (swizzleBytes == 64) {
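
The hunk is truncated after the 64-byte case. For orientation, the `swizzle_mode` values appear to follow the CUDA tensor-map swizzle enumeration; this mapping is an assumption, since only the 128B case (and the new 0 default for unswizzled layouts) is visible here.

# swizzleBytes -> swizzle_mode, matching CUtensorMapSwizzle
# (0 = none, 1 = 32B, 2 = 64B, 3 = 128B); unswizzled layouts now take the 0 path.
SWIZZLE_MODE = {0: 0, 32: 1, 64: 2, 128: 3}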

lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeDescriptorEncoding.cpp

Lines changed: 69 additions & 26 deletions
@@ -1,22 +1,16 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Transforms/Passes.h"
-#include "triton/Analysis/AxisInfo.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
-#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
-#include "triton/Tools/Sys/GetEnv.hpp"
 #include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/VersionTuple.h"
+#include <algorithm>
 #include <memory>
 #include <unordered_set>

@@ -35,6 +29,7 @@ struct UseInfo {
   TypedValue<tt::TensorDescType> descriptor;
   Operation *use;
   Attribute desiredSharedEncoding;
+  SmallVector<int64_t> shape;
   ttg::CTALayoutAttr ctaLayout;
 };

@@ -72,6 +67,14 @@ ttg::CTALayoutAttr getCtaLayoutFromEncoding(Attribute encoding) {
                                  layout.getCTASplitNum(), layout.getCTAOrder());
 }

+SmallVector<int64_t> expandToRank(ArrayRef<int64_t> shape, int rank) {
+  SmallVector<int64_t> result(rank, 1);
+  assert(shape.size() <= rank);
+  auto rankDiff = rank - shape.size();
+  std::copy(shape.begin(), shape.end(), result.begin() + rankDiff);
+  return result;
+}
+
 std::optional<UseInfo> getUseInfo(Operation *op) {
   UseInfo info;
   info.use = op;
@@ -81,6 +84,9 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
     auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
                                                : load.getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = load.getResult().getType().getShape();
+    auto rank = load.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto gather = dyn_cast<tt::DescriptorGatherOp>(op)) {
@@ -89,18 +95,27 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
     auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
                                                : gather.getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = gather.getResult().getType().getShape();
+    auto rank = gather.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto store = dyn_cast<tt::DescriptorStoreOp>(op)) {
     info.descriptor = store.getDesc();
     auto encoding = store.getSrc().getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = store.getSrc().getType().getShape();
+    auto rank = store.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto scatter = dyn_cast<tt::DescriptorScatterOp>(op)) {
     info.descriptor = scatter.getDesc();
     auto encoding = scatter.getSrc().getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = scatter.getSrc().getType().getShape();
+    auto rank = scatter.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   return std::nullopt;
@@ -109,12 +124,15 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
 struct EncodingInfo {
   Attribute desiredEncoding;
   ttg::CTALayoutAttr ctaLayout;
+  // Shape may be different from the descriptor block shape for gather/scatter
+  // use case
+  SmallVector<int64_t> shape;
   bool forcedToDefault = false;

   bool operator==(const EncodingInfo &other) const {
     return desiredEncoding == other.desiredEncoding &&
            ctaLayout == other.ctaLayout &&
-           forcedToDefault == other.forcedToDefault;
+           forcedToDefault == other.forcedToDefault && shape == other.shape;
   }
 };

@@ -123,7 +141,8 @@ struct EncodingInfo {
 template <> struct std::hash<EncodingInfo> {
   size_t operator()(const EncodingInfo &einfo) const {
     return llvm::hash_combine(einfo.desiredEncoding, einfo.ctaLayout,
-                              einfo.forcedToDefault);
+                              einfo.forcedToDefault,
+                              ArrayRef<int64_t>(einfo.shape));
   }
 };

@@ -172,6 +191,21 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
   // Always propagate forcedToDefault
   result.forcedToDefault = lhs.forcedToDefault || rhs.forcedToDefault;

+  if (result.forcedToDefault)
+    return result;
+
+  if (lhs.shape.empty() || lhs.shape == rhs.shape)
+    result.shape = rhs.shape;
+  else if (rhs.shape.empty())
+    result.shape = lhs.shape;
+  else {
+    assert(lhs.shape.size() == rhs.shape.size());
+    auto rank = lhs.shape.size();
+    result.shape.reserve(rank);
+    for (int i = 0; i < rank; ++i)
+      result.shape.push_back(std::min(lhs.shape[i], rhs.shape[i]));
+  }
+
   SetVector<ttg::CTALayoutAttr> ctaLayouts;
   if (lhs.ctaLayout)
     ctaLayouts.insert(lhs.ctaLayout);
@@ -190,9 +224,6 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
     break;
   }

-  if (result.forcedToDefault)
-    return result;
-
   SetVector<Attribute> desiredEncodings;
   if (lhs.desiredEncoding)
     desiredEncodings.insert(lhs.desiredEncoding);
@@ -213,23 +244,32 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
 }

 Attribute getFallbackSharedEncoding(RankedTensorType tensorType,
-                                    ttg::CTALayoutAttr ctaLayout) {
+                                    ttg::CTALayoutAttr ctaLayout,
+                                    ArrayRef<int64_t> usageShape) {
   auto ctx = tensorType.getContext();
   SmallVector<unsigned> order;
   for (int i = tensorType.getRank() - 1; i >= 0; --i)
     order.push_back(i);

+  ArrayRef<int64_t> shape =
+      usageShape.empty() ? tensorType.getShape() : usageShape;
   if (!ctaLayout)
     ctaLayout = ttg::CTALayoutAttr::getDefault(ctx, tensorType.getRank());
   else if (ctaLayout.getRank() != tensorType.getRank())
-    ctaLayout = ttng::updateCTALayoutForShape(ctaLayout, tensorType.getShape());
+    ctaLayout = ttng::updateCTALayoutForShape(ctaLayout, shape);
+
+  auto elemTy = tensorType.getElementType();
+  auto shapePerCTA = ttg::getShapePerCTA(ctaLayout.getCTASplitNum(), shape);
+  unsigned eleBitWidth = tensorType.getElementType().getIntOrFloatBitWidth();

-  if (tensorType.getRank() == 1) {
+  auto contigDimSizeInBytes = shapePerCTA.back() * eleBitWidth / 8;
+  auto rank = tensorType.getRank();
+  if (rank == 1 || contigDimSizeInBytes < 32 || shape[rank - 2] < 8) {
     return ttg::SwizzledSharedEncodingAttr::get(ctx, 1, 1, 1, order, ctaLayout);
   }
-  return ttg::NVMMASharedEncodingAttr::get(
-      ctx, tensorType.getShape(), order, ctaLayout, tensorType.getElementType(),
-      /*fp4Padded*/ false);
+  return ttg::NVMMASharedEncodingAttr::get(ctx, shape, order, ctaLayout,
+                                           tensorType.getElementType(),
+                                           /*fp4Padded*/ false);
 }

 tt::TensorDescType getTensorDescTypeWithEncoding(Operation *op,
@@ -274,17 +314,19 @@ void assignMemoryLayouts(tt::FuncOp &func) {
   // fallback to default encoding
   for (auto blockArg : func.getBlocks().front().getArguments())
     if (auto desc = dyn_cast<TypedValue<tt::TensorDescType>>(blockArg))
-      updateEncoding({desc}, EncodingInfo{{}, {}, /*forcedToDefault=*/true});
+      updateEncoding({desc},
+                     EncodingInfo{{}, {}, {}, /*forcedToDefault=*/true});

   func.walk([&](Operation *op) {
     if (auto info = getUseInfo(op)) {
-      updateEncoding(info->descriptor, EncodingInfo{info->desiredSharedEncoding,
-                                                    info->ctaLayout});
+      updateEncoding(info->descriptor,
+                     EncodingInfo{info->desiredSharedEncoding, info->ctaLayout,
+                                  info->shape});
     } else {
       bool forcedToDefault =
           isa<tt::CallOp, tt::ReturnOp, tt::ReinterpretTensorDescOp>(op);
       auto einfo =
-          internEncoding(encodings, EncodingInfo{{}, {}, forcedToDefault});
+          internEncoding(encodings, EncodingInfo{{}, {}, {}, forcedToDefault});

       auto setEncoding = [&](Value v) {
         auto typedVal = cast<TypedValue<tt::TensorDescType>>(v);
@@ -344,9 +386,10 @@ void assignMemoryLayouts(tt::FuncOp &func) {
     if (einfo->desiredEncoding) {
       newEncoding = einfo->desiredEncoding;
     } else if (einfo->forcedToDefault) {
-      newEncoding = getFallbackSharedEncoding(existingTy, {});
+      newEncoding = getFallbackSharedEncoding(existingTy, {}, {});
     } else {
-      newEncoding = getFallbackSharedEncoding(existingTy, einfo->ctaLayout);
+      newEncoding =
+          getFallbackSharedEncoding(existingTy, einfo->ctaLayout, einfo->shape);
     }
     desc.setType(getTensorDescTypeWithEncoding(desc.getDefiningOp(), existingTy,
                                                newEncoding));
@@ -356,14 +399,14 @@ void assignMemoryLayouts(tt::FuncOp &func) {
   SmallVector<Type> resultTys(func.getResultTypes());
   for (auto [i, argTy] : llvm::enumerate(argTys)) {
     if (auto descTy = dyn_cast<tt::TensorDescType>(argTy)) {
-      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {});
+      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {}, {});
       argTys[i] = getTensorDescTypeWithEncoding(nullptr, descTy.getBlockType(),
                                                 encoding);
     }
   }
   for (auto [i, resultTy] : llvm::enumerate(resultTys)) {
     if (auto descTy = dyn_cast<tt::TensorDescType>(resultTy)) {
-      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {});
+      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {}, {});
       resultTys[i] = getTensorDescTypeWithEncoding(
           nullptr, descTy.getBlockType(), encoding);
     }
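
A compact restatement of the new fallback rule in getFallbackSharedEncoding, as a Python sketch. CTA splitting is simplified to a single divisor on the last dimension; this is an illustration only, not code from the pass.

def uses_nvmma_swizzled_layout(block_shape, elem_bits, cta_split_last=1):
    # Swizzled NVMMA shared encodings need: rank >= 2, a contiguous dimension of
    # at least 32 bytes per CTA, and at least 8 rows; otherwise the pass now
    # falls back to SwizzledSharedEncodingAttr(vec=1, perPhase=1, maxPhase=1).
    if len(block_shape) == 1:
        return False
    contig_bytes = (block_shape[-1] // cta_split_last) * elem_bits // 8
    return contig_bytes >= 32 and block_shape[-2] >= 8


assert not uses_nvmma_swizzled_layout([2, 16], 16)  # too few rows -> unswizzled fallback
assert uses_nvmma_swizzled_layout([8, 32], 16)      # fp16 8x32 -> swizzled NVMMA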

python/test/unit/cuda/test_tensor_descriptor.py

Lines changed: 4 additions & 8 deletions
@@ -11,7 +11,8 @@
 @requires_tma
 @pytest.mark.interpreter
 @pytest.mark.parametrize("dtype_str", tma_dtypes)
-def test_tensor_descriptor_load(dtype_str):
+@pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32)])
+def test_tensor_descriptor_load(dtype_str, M_BLOCK, N_BLOCK):

     @triton.jit
     def kernel(out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
@@ -41,9 +42,6 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):

     M, N = 32, 128
     inp = to_triton(numpy_random((M, N), dtype_str), device="cuda", dst_type=dtype_str)
-
-    M_BLOCK = 8
-    N_BLOCK = 32
     out = inp.new_empty((M_BLOCK, N_BLOCK))

     kernel[(1, )](out, inp, M, N, M_BLOCK, N_BLOCK)
@@ -55,7 +53,8 @@ def alloc_fn(size: int, align: int, stream: Optional[int]):
 @requires_tma
 @pytest.mark.interpreter
 @pytest.mark.parametrize("dtype_str", tma_dtypes)
-def test_tensor_descriptor_store(dtype_str):
+@pytest.mark.parametrize("M_BLOCK,N_BLOCK", [(2, 16), (8, 16), (8, 32)])
+def test_tensor_descriptor_store(dtype_str, M_BLOCK, N_BLOCK):

     @triton.jit
     def kernel(out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):
@@ -84,9 +83,6 @@ def kernel(out_ptr, a_ptr, M, N, M_BLOCK: tl.constexpr, N_BLOCK: tl.constexpr):

     M, N = 32, 128
     inp = to_triton(numpy_random((M, N), dtype_str), device="cuda", dst_type=dtype_str)
-
-    M_BLOCK = 8
-    N_BLOCK = 32
     out = inp.new_empty((M, N))

     grid_m = M // M_BLOCK

python/test/unit/language/test_core.py

Lines changed: 5 additions & 5 deletions
@@ -4974,31 +4974,31 @@ def test_tma_load_block_shape_err(device):

     @triton.jit
     def kernel(ptr):
-        desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [1, 32])
+        desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [1, 2])
         desc.load([0, 0])

     input = torch.empty((128, 128), dtype=torch.int32, device=device)
     errc = triton.CompilationError if not is_interpreter() else InterpreterError
     with pytest.raises(errc) as e:
         kernel[(1, )](input)

-    assert "tensor descriptor block shape must have at least 8 rows" in str(e.value.__cause__)
+    assert "Descriptor block shape must have at least 16 bytes" in str(e.value.__cause__)


 @pytest.mark.interpreter
 def test_tma_store_block_shape_err(device):

     @triton.jit
     def kernel(ptr):
-        desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [8, 8])
-        desc.store([0, 0], tl.zeros((1, 32), dtype=tl.int16))
+        desc = tl.make_tensor_descriptor(ptr, [128, 128], [128, 1], [8, 4])
+        desc.store([0, 0], tl.zeros([8, 4], dtype=tl.int16))

     input = torch.empty((128, 128), dtype=torch.int16, device=device)
     errc = triton.CompilationError if not is_interpreter() else InterpreterError
     with pytest.raises(errc) as e:
         kernel[(1, )](input)

-    assert "int16 tensor descriptor block shape must have at least 16 columns" in str(e.value.__cause__)
+    assert "Descriptor block shape must have at least 16 bytes" in str(e.value.__cause__)


 def test_trans_reshape(device):

python/triton/language/semantic.py

Lines changed: 7 additions & 13 deletions
@@ -1152,21 +1152,9 @@ def reinterpret_tensor_descriptor(desc_ptr: tl.tensor, block_ty: tl.block_type,
     return tl.tensor_descriptor_base(handle, block_ty)


-def validate_descriptor_block(shape, dtype):
-    if len(shape) != 2:
-        return
-    # Due to limitations of the shared memory encoding, the TMA bounding box has
-    # to be at least as big as the swizzle tile.
-    assert shape[0] >= 8, f"tensor descriptor block shape must have at least 8 rows, but got {shape[0]}"
-    min_cols = 32 // dtype.primitive_bitwidth * 8
-    assert shape[
-        1] >= min_cols, f"{dtype} tensor descriptor block shape must have at least {min_cols} columns, but got {shape[1]}"
-
-
 def descriptor_load(desc: tl._experimental_tensor_desciptor_base, offsets, cache_modifier: str, eviction_policy: str,
                     builder: ir.builder) -> tl.tensor:
     assert isinstance(desc, tl.tensor_descriptor_base)
-    validate_descriptor_block(desc.block_shape, desc.dtype)
     ndim = len(desc.block_shape)
     assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
@@ -1178,7 +1166,6 @@ def descriptor_load(desc: tl._experimental_tensor_desciptor_base, offsets, cache

 def descriptor_store(desc: tl.tensor_descriptor_base, value: tl.tensor, offsets, builder: ir.builder) -> tl.tensor:
     assert isinstance(desc, tl.tensor_descriptor_base)
-    validate_descriptor_block(desc.block_shape, desc.dtype)
     ndim = len(desc.block_shape)
     assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
     assert value.shape == desc.block_shape
@@ -1931,6 +1918,13 @@ def make_tensor_descriptor(
         raise ValueError(f"Expected {ndim} strides but got {len(strides)}")
     if len(block_shape) != ndim:
         raise ValueError(f"Expected block_shape to have {ndim} dimensions but got {len(strides)}")
+    assert isinstance(base.dtype, tl.pointer_type)
+    elem_size = base.dtype.element_ty.primitive_bitwidth // 8
+    contig_dim_size = tl._constexpr_to_value(block_shape[-1])
+    if contig_dim_size * elem_size < 16:
+        raise ValueError(
+            f"Descriptor block shape must have at least 16 bytes in the last dimension, but got {contig_dim_size} * {elem_size} = {contig_dim_size * elem_size} bytes"
+        )

     strides[-1] = tl._constexpr_to_value(strides[-1])
     if strides[-1] != 1:
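
The removed validate_descriptor_block checks are replaced by a single dtype-independent rule at descriptor creation: the last block dimension must cover at least 16 bytes. A tiny sketch of what that implies per element width (an illustration, not code from the commit):

def min_last_block_dim(elem_bits: int) -> int:
    # Smallest legal block_shape[-1] under the 16-byte rule enforced above.
    elem_size = elem_bits // 8
    return (16 + elem_size - 1) // elem_size


assert min_last_block_dim(32) == 4    # float32 / int32
assert min_last_block_dim(16) == 8    # float16 / int16
assert min_last_block_dim(8) == 16    # int8 / fp8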
