Commit 8a5d1ee

[BACKEND] Add support for LinearSharedEncoding (triton-lang#8116)
For now we just add it to the gluon test.
1 parent a90ac86 commit 8a5d1ee

9 files changed, +302 -0 lines changed

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 30 additions & 0 deletions
@@ -493,6 +493,36 @@ For identity mappings a short form based on order and shape is used to increase
   let genVerifyDecl = 1;
 }
 
+def SharedLinearEncodingAttr
+    : TritonGPU_Attr<"SharedLinearEncoding", "shared_linear_encoding",
+                     [SharedEncodingTrait, DeclareLayoutEncodingMethods]> {
+  let mnemonic = "shared_linear";
+
+  let description = [{
+    Linear shared encodings mirror LinearEncodingAttr but operate on shared
+    memory layouts. The LinearLayout parameter captures how shared memory
+    offsets (and optionally blocks) map to logical tensor indices.
+  }];
+
+  let parameters = (ins LinearLayoutParam:$linearLayout, "unsigned":$layoutAlignment);
+
+  let extraClassDeclaration = [{
+    SmallVector<unsigned> basesPerDim(StringAttr dimName,
+                                      bool skipBroadcast = true) const;
+    SmallVector<unsigned> orderPerDim(StringAttr dimName,
+                                      ArrayRef<unsigned> defaultOrder) const;
+
+    SmallVector<unsigned> getOrder() const;
+
+    LinearLayout toLinearLayout(ArrayRef<int64_t> shape) const;
+
+    int32_t getAlignment() const { return static_cast<int32_t>(getLayoutAlignment()); }
+  }];
+
+  let genVerifyDecl = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
 def NVMMASharedEncodingAttr : TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [DeclareSharedEncodingMethods, LayoutEncodingTrait]> {
   let mnemonic = "nvmma_shared";
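
To make the description above concrete, here is a small sketch (not part of this commit) of how offset bases map a shared-memory offset to a logical tensor index, assuming Triton's usual LinearLayout semantics in which each set bit of the input selects one basis and the output is the XOR of the selected bases:

# Hypothetical helper, for illustration only: apply offset bases to a
# shared-memory offset.  Bit i of `offset` selects offset_bases[i]; the
# logical tensor index is the elementwise XOR of the selected bases.
def apply_offset_bases(offset_bases, offset):
    rank = len(offset_bases[0]) if offset_bases else 0
    index = [0] * rank
    for bit, basis in enumerate(offset_bases):
        if (offset >> bit) & 1:
            index = [i ^ b for i, b in zip(index, basis)]
    return index

# Bases from the round-trip test added in ops.mlir below: a 4x4 layout whose
# last basis ([2, 2]) introduces a swizzle relative to a plain row-major layout.
bases = [[0, 1], [0, 2], [1, 0], [2, 2]]
for offset in range(16):
    print(offset, "->", apply_offset_bases(bases, offset))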

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 178 additions & 0 deletions
@@ -187,6 +187,9 @@ SmallVector<unsigned> getOrder(SharedEncodingTrait layout,
   if (auto paddedEnc = dyn_cast<PaddedSharedEncodingAttr>(layout)) {
     return paddedEnc.getOrder();
   }
+  if (auto linearEnc = dyn_cast<SharedLinearEncodingAttr>(layout)) {
+    return linearEnc.getOrder();
+  }
   if (auto sharedLayout = dyn_cast<NVMMASharedEncodingAttr>(layout)) {
     if (shape.size() == 1) {
       return {0};
@@ -1566,6 +1569,181 @@ void SwizzledSharedEncodingAttr::print(AsmPrinter &printer) const {
   printer << "}>";
 }
 
+//===----------------------------------------------------------------------===//
+// SharedLinear encoding
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+SharedLinearEncodingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                                 LinearLayout linearLayout,
+                                 unsigned layoutAlignment) {
+  if (layoutAlignment == 0 || !llvm::isPowerOf2_32(layoutAlignment)) {
+    return emitError() << "alignment must be a positive power of two";
+  }
+  static const auto expectedInDims =
+      SmallVector<std::string>({"offset", "block"});
+  for (const auto &[index, dims] : llvm::enumerate(
+           llvm::zip(linearLayout.getInDimNames(), expectedInDims))) {
+    const auto &[dim, expected] = dims;
+    if (dim.str() != expected) {
+      return emitError() << "Expected input dimension " << index << " to be '"
+                         << expected << "'. Got " << dim;
+    }
+  }
+
+  for (auto [i, dim] : llvm::enumerate(linearLayout.getOutDimNames())) {
+    if (dim.str() != ("dim" + llvm::Twine(i)).str()) {
+      return emitError()
+             << "Expected output dimensions to be ['dim0', 'dim1', ...]. Got "
+             << dim << " at position " << i;
+    }
+  }
+
+  SmallVector<StringAttr> outDimNames =
+      llvm::to_vector(linearLayout.getOutDimNames());
+  if (outDimNames.empty()) {
+    return emitError()
+           << "SharedLinearEncodingAttr requires at least one output"
+              " dimension.";
+  }
+
+  auto *ctx = outDimNames.front().getContext();
+  auto kOffset = StringAttr::get(ctx, "offset");
+  auto kBlock = StringAttr::get(ctx, "block");
+
+  if (!linearLayout.isSurjective()) {
+    return emitError() << "The layout must be surjective";
+  }
+
+  LinearLayout withoutBroadcast =
+      linearLayout.removeZeroBasesAlongDim(kOffset).removeZeroBasesAlongDim(
+          kBlock);
+  if (!withoutBroadcast.isInvertible()) {
+    return emitError()
+           << "After removing the zero bases the layout must be bijective";
+  }
+
+  return success();
+}
+
+void SharedLinearEncodingAttr::print(AsmPrinter &printer) const {
+  printer << "<{";
+  auto layout = getLinearLayout();
+  auto kBlock = StringAttr::get(getContext(), "block");
+  auto kOffset = StringAttr::get(getContext(), "offset");
+  if (layout.getBases().lookup(kBlock).empty()) {
+    layout =
+        layout.sublayout({kOffset}, llvm::to_vector(layout.getOutDimNames()));
+  }
+  printLinearLayout(printer, layout);
+  printer << "}, alignment = " << getAlignment() << "}>";
+}
+
+Attribute SharedLinearEncodingAttr::parse(AsmParser &parser, Type type) {
+  if (parser.parseLess().failed())
+    return {};
+
+  DictionaryAttr layoutDictRaw;
+  if (parser.parseAttribute(layoutDictRaw).failed())
+    return {};
+
+  if (layoutDictRaw.get("alignment")) {
+    parser.emitError(parser.getCurrentLocation())
+        << "alignment must be specified outside of the linear layout braces";
+    return {};
+  }
+
+  NamedAttrList layoutAttrList(layoutDictRaw.getValue());
+  auto *ctx = parser.getContext();
+  auto kBlock = StringAttr::get(ctx, "block");
+  if (!layoutAttrList.get(kBlock)) {
+    layoutAttrList.push_back({kBlock, ArrayAttr::get(ctx, {})});
+  }
+
+  DictionaryAttr layoutDict = layoutAttrList.getDictionary(ctx);
+
+  // Parse alignment
+  unsigned layoutAlignment;
+  if (parser.parseComma().failed())
+    return {};
+  if (parser.parseKeyword("alignment").failed() || parser.parseEqual().failed())
+    return {};
+  if (parser.parseInteger(layoutAlignment).failed())
+    return {};
+
+  if (parser.parseGreater().failed())
+    return {};
+
+  std::vector<std::string> inDimNames = {"offset", "block"};
+  auto maybeLL = parseLinearLayout(layoutDict, parser, inDimNames);
+  if (!maybeLL.has_value())
+    return {};
+
+  // Special case for cleaner errors
+  if (layoutDict.get("alignment")) {
+    parser.emitError(parser.getCurrentLocation())
+        << "alignment must be specified outside of the linear layout braces";
+    return {};
+  }
+
+  if (layoutDict.size() != 2) {
+    parser.emitError(parser.getCurrentLocation())
+        << "SharedLinearEncodingAttr must have exactly two attributes: offset "
+           "and block";
+    return {};
+  }
+
+  return parser.getChecked<SharedLinearEncodingAttr>(
+      parser.getContext(), std::move(*maybeLL), layoutAlignment);
+}
+
+SmallVector<unsigned>
+SharedLinearEncodingAttr::basesPerDim(StringAttr dimName,
+                                      bool skipBroadcast) const {
+  auto ll = getLinearLayout();
+  auto rank = ll.getNumOutDims();
+  return basesPerDimImpl(ll.getBases(), dimName, rank, skipBroadcast);
+}
+
+SmallVector<unsigned>
+SharedLinearEncodingAttr::orderPerDim(StringAttr dimName,
+                                      ArrayRef<unsigned> defaultOrder) const {
+  return orderPerDimImpl(getLinearLayout(), dimName, defaultOrder);
+}
+
+SmallVector<unsigned> SharedLinearEncodingAttr::getOrder() const {
+  auto ll = getLinearLayout();
+  auto rank = ll.getNumOutDims();
+  SmallVector<unsigned> defaultOrder(rank);
+  std::iota(defaultOrder.rbegin(), defaultOrder.rend(), 0);
+  return orderPerDim(StringAttr::get(getContext(), "offset"), defaultOrder);
+}
+
+SmallVector<unsigned> SharedLinearEncodingAttr::getCTAsPerCGA() const {
+  return basesPerDim(StringAttr::get(getContext(), "block"),
+                     /*skipBroadcast=*/false);
+}
+
+SmallVector<unsigned> SharedLinearEncodingAttr::getCTAOrder() const {
+  return orderPerDim(StringAttr::get(getContext(), "block"), getOrder());
+}
+
+SmallVector<unsigned> SharedLinearEncodingAttr::getCTASplitNum() const {
+  return basesPerDim(StringAttr::get(getContext(), "block"));
+}
+
+LinearLayout
+SharedLinearEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
+  auto ll = getLinearLayout();
+  auto outDimNames = llvm::to_vector(ll.getOutDimNames());
+  assert(shape.size() == outDimNames.size());
+  // We don't support automatic broadcasting for shared linear layouts
+  for (auto [size, llSize] : llvm::zip(shape, ll.getOutDimSizes())) {
+    assert(size == llSize);
+  }
+  return ll;
+}
+
 //===----------------------------------------------------------------------===//
 // PaddedShared encoding
 //===----------------------------------------------------------------------===//
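
For reference, a rough Python mirror (not part of this commit; brute force instead of the linear-algebra checks used in the C++ above) of the constraints the verifier enforces on the offset and block bases:

from itertools import product

# Hypothetical sketch of the verifier's structural checks: alignment must be a
# positive power of two, the layout must cover every tensor index (surjective),
# and it must be one-to-one once broadcast (all-zero) bases are dropped.
def xor_map(bases, coord, rank):
    index = [0] * rank
    for bit, basis in enumerate(bases):
        if (coord >> bit) & 1:
            index = [i ^ b for i, b in zip(index, basis)]
    return tuple(index)

def verify_shared_linear(offset_bases, block_bases, shape, alignment):
    assert alignment > 0 and (alignment & (alignment - 1)) == 0, \
        "alignment must be a positive power of two"
    rank = len(shape)
    # Surjectivity: every tensor index must be hit by some (offset, block) pair.
    images = set()
    for block in range(1 << len(block_bases)):
        for offset in range(1 << len(offset_bases)):
            b = xor_map(block_bases, block, rank)
            o = xor_map(offset_bases, offset, rank)
            images.add(tuple(x ^ y for x, y in zip(b, o)))
    assert images == set(product(*(range(s) for s in shape))), \
        "the layout must be surjective"
    # Bijectivity after dropping broadcast bases: distinct inputs must map to
    # distinct indices once all-zero bases are removed.
    nonzero = [b for b in offset_bases + block_bases if any(b)]
    seen = set()
    for coord in range(1 << len(nonzero)):
        idx = xor_map(nonzero, coord, rank)
        assert idx not in seen, \
            "after removing the zero bases the layout must be bijective"
        seen.add(idx)

# The 4x4 layout from the round-trip test in ops.mlir passes these checks.
verify_shared_linear([[0, 1], [0, 2], [1, 0], [2, 2]], [], [4, 4], 16)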

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 2 additions & 0 deletions
@@ -1253,6 +1253,8 @@ LinearLayout TritonGPUDialect::toLinearLayout(ArrayRef<int64_t> shape,
          "shape must be a postive power of 2");
   if (auto shared = dyn_cast<SwizzledSharedEncodingAttr>(layout)) {
     result = swizzledSharedToLinearLayout(shape, shared);
+  } else if (auto shared = dyn_cast<SharedLinearEncodingAttr>(layout)) {
+    result = shared.toLinearLayout(shape);
   } else if (auto shared = dyn_cast<NVMMASharedEncodingAttr>(layout)) {
     result = nvmmaSharedToLinearLayout(shape, shared);
   } else if (auto sbl = dyn_cast<AMDRotatingSharedEncodingAttr>(layout)) {

python/src/gluon_ir.cc

Lines changed: 14 additions & 0 deletions
@@ -395,6 +395,20 @@ void init_gluon_ir(py::module &&m) {
             return ttg::PaddedSharedEncodingAttr::get(ctx, intervals, paddings,
                                                       ll);
           })
+      .def("get_shared_linear_layout",
+           [](GluonOpBuilder &self, std::vector<std::vector<int>> &offsetBases,
+              std::vector<std::vector<int>> &blockBases,
+              std::vector<int64_t> &shape, unsigned alignment) -> Attribute {
+             auto ctx = self.getContext();
+             auto kOffset = mlir::StringAttr::get(ctx, "offset");
+             auto kBlock = mlir::StringAttr::get(ctx, "block");
+             auto ll = tt::LinearLayout(
+                 {{kOffset, offsetBases}, {kBlock, blockBases}},
+                 tt::standardOutDimPairs(ctx, shape),
+                 /*requireSurjective=*/true);
+             return self.getChecked<ttg::SharedLinearEncodingAttr>(ctx, ll,
+                                                                   alignment);
+           })
       .def("get_nvmma_shared_layout",
           [](GluonOpBuilder &self, unsigned swizzleByteWidth,
              unsigned elementBitwidth, bool transposed, bool fp4Padded,

python/test/gluon/test_lowerings.py

Lines changed: 15 additions & 0 deletions
@@ -721,6 +721,7 @@ def kernel(x_ptr, y_ptr, M: ttgl.constexpr, N: ttgl.constexpr, src_layout: ttgl.
     ttgl.SwizzledSharedLayout(vec=4, per_phase=2, max_phase=4, order=[0, 1]),
     ttgl.SwizzledSharedLayout(vec=8, per_phase=1, max_phase=8, order=[1, 0]),
     ttgl.SwizzledSharedLayout(vec=16, per_phase=1, max_phase=16, order=[1, 0]),
+    "shared_linear_layout",
 ])
 
 
@@ -733,6 +734,20 @@ def kernel(x_ptr, y_ptr, M: ttgl.constexpr, N: ttgl.constexpr, src_layout: ttgl.
 @pytest.mark.parametrize("dist_layout", _ld_st_dot_layouts + _ld_st_mma_layouts)
 @pytest.mark.parametrize("shared_layout", _ld_st_shared_layouts)
 def test_local_load_store_2d_layouts(shape, dtype, dist_layout, shared_layout, device):
+    if shared_layout == "shared_linear_layout":
+        rank = len(shape)
+        assert rank == 2
+        offset_bases = []
+        for dim, size in enumerate(shape):
+            assert size > 0 and (size & (size - 1)) == 0
+            stride = 1
+            while stride < size:
+                basis = [0] * rank
+                basis[dim] = stride
+                offset_bases.append(basis)
+                stride <<= 1
+        shared_layout = ttgl.SharedLinearLayout(offset_bases=offset_bases, block_bases=[], shape=list(shape))
+
     if isinstance(shared_layout, ttgl.NVMMASharedLayout):
         contig_dim = 0 if shared_layout.transposed else 1
         if shape[contig_dim] < (8 * shared_layout.swizzle_byte_width) / shared_layout.element_bitwidth:

python/triton/experimental/gluon/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@
     NVMMASharedLayout,
     SwizzledSharedLayout,
     PaddedSharedLayout,
+    SharedLinearLayout,
 )
 from ._math import (
     umulhi,

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 39 additions & 0 deletions
@@ -587,6 +587,45 @@ def __hash__(self):
                      tuple(map(tuple, self.block_bases)), tuple(self.shape)))
 
 
+@dataclass(frozen=True)
+class SharedLinearLayout(SharedLayout):
+    """Represents a shared memory layout defined via an explicit LinearLayout."""
+
+    offset_bases: List[List[int]]
+    block_bases: List[List[int]]
+    shape: List[int]
+    alignment: int = 16
+
+    def __post_init__(self):
+        super().__setattr__("offset_bases", _unwrap_shape(self.offset_bases))
+        super().__setattr__("block_bases", _unwrap_shape(self.block_bases))
+        super().__setattr__("shape", _unwrap_shape(self.shape))
+        super().__setattr__("alignment", _unwrap_if_constexpr(self.alignment))
+
+        rank = len(self.shape)
+        assert rank > 0, "SharedLinearLayout shape must not be empty"
+        for basis in self.offset_bases:
+            assert len(basis) == rank
+        for basis in self.block_bases:
+            assert len(basis) == rank
+        assert self.alignment > 0 and (self.alignment & (self.alignment - 1)) == 0, \
+            "SharedLinearLayout alignment must be a positive power of two"
+
+    def _to_ir(self, builder):
+        return builder.get_shared_linear_layout(self.offset_bases, self.block_bases, self.shape, self.alignment)
+
+    def mangle(self) -> str:
+        return f"SharedLinear_{self.offset_bases}_{self.block_bases}_{self.shape}_{self.alignment}_SharedLinear"
+
+    def __hash__(self):
+        return hash((
+            tuple(map(tuple, self.offset_bases)),
+            tuple(map(tuple, self.block_bases)),
+            tuple(self.shape),
+            self.alignment,
+        ))
+
+
 # Python impl of LinearEncodingAttr::basesPerDim
 def bases_per_dim(bases, rank, skip_broadcast=True):
     result = [1] * rank
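
A minimal usage sketch of the new SharedLinearLayout class (not part of this commit, and assuming a Triton build that includes it), constructing the same identity-style offset bases the gluon test above builds for a power-of-two 2D shape:

from triton.experimental.gluon import language as ttgl

# Row-major-style offset bases: each basis sets a single power-of-two stride
# along one dimension, mirroring the loop in test_local_load_store_2d_layouts.
shape = [4, 8]
offset_bases = []
for dim, size in enumerate(shape):
    stride = 1
    while stride < size:
        basis = [0] * len(shape)
        basis[dim] = stride
        offset_bases.append(basis)
        stride <<= 1

layout = ttgl.SharedLinearLayout(offset_bases=offset_bases, block_bases=[], shape=shape)
print(layout.mangle())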

test/TritonGPU/invalid.mlir

Lines changed: 6 additions & 0 deletions
@@ -466,3 +466,9 @@ tt.func @async_copy_invalid_other_type(%input: tensor<64x64x!tt.ptr<f16>, #block
 #shared = #ttg.padded_shared<[4:+4] {offset=[[1, 0], [2, 0], [0, 1], [0, 2]], block=[]}>
 // expected-error @below {{Mismatch in expected shape for dimension 0. Expected: 4, got: 8}}
 !out_dim_too_large = !ttg.memdesc<8x8xf32, #shared, #ttg.shared_memory>
+
+// -----
+
+// expected-error @below {{alignment must be specified outside of the linear layout braces}}
+#shared = #ttg.shared_linear<{offset = [[0, 1], [0, 2], [1, 0], [2, 0]], block = [], alignment = 16}>
+!alignment_in_layout = !ttg.memdesc<4x4xf32, #shared, #ttg.shared_memory>

test/TritonGPU/ops.mlir

Lines changed: 17 additions & 0 deletions
@@ -257,3 +257,20 @@ tt.func @async_commit_group(%arg0: !ttg.async.token) {
   %1 = ttg.async_commit_group
   tt.return
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 2], warpsPerCTA = [1, 1], order = [1, 0]}>
+#shared = #ttg.shared_linear<{offset = [[0, 1], [0, 2], [1, 0], [2, 2]]}, alignment = 16>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.threads-per-warp" = 4 : i32, "ttg.num-warps" = 1 : i32} {
+  tt.func @round_trip(%arg0: tensor<4x4xf32, #blocked>) -> tensor<4x4xf32, #blocked> {
+    // CHECK: ttg.local_alloc
+    // CHECK-SAME: !ttg.memdesc<4x4xf32, #shared
+    %alloc = ttg.local_alloc %arg0 : (tensor<4x4xf32, #blocked>) -> !ttg.memdesc<4x4xf32, #shared, #smem, mutable>
+    ttg.local_store %arg0, %alloc : tensor<4x4xf32, #blocked> -> !ttg.memdesc<4x4xf32, #shared, #smem, mutable>
+    %loaded = ttg.local_load %alloc : !ttg.memdesc<4x4xf32, #shared, #smem, mutable> -> tensor<4x4xf32, #blocked>
+    tt.return %loaded : tensor<4x4xf32, #blocked>
+  }
+}
