Skip to content

Commit b7dd07d

Browse files
authored
Introduce Subgroup 2D Block Encoding (#4193)
Add a new layout to describe the tensor layout with respect to the GPU compute hierarchy (register, lane, warp, block). This PR introduces the layout and adds its definition and basic functions to the Triton Intel GPU Dialect. The conversion to Linear Layout function has been added and unit tested through an Intel specific `LinearLayoutConversionsTest`. The layouts are unpacked - each register is assumed to be the size of the tensor type. However, the layout generation follows the convention described in the SPV_INTEL_2d_block_io extension: https://github.khronos.org/SPIRV-Registry/extensions/INTEL/SPV_INTEL_2d_block_io.html. While there may be some bugs, the goal is for any valid operation described in the SPIR-V extension to be represented correctly with this layout. Currently the layout is unused other than for linear layout conversion testing purposes. I plan to leave this PR in draft until I have replaced the `block_io` attribute on the load ops with this layout - and then I plan to replace the linear layout code I added to `LoadStoreOpToLLVM.cpp`. That second task might prove challenging since I think the DPAS layouts do sometimes incorporate register packing schemes into the layout - but looking at the upstream layouts for NVIDIA and AMD MMA, specific packing is an implementation detail and not represented as part of the high-level layout encoding. cc #4192
1 parent 0bb3280 commit b7dd07d

File tree

7 files changed

+524
-0
lines changed

7 files changed

+524
-0
lines changed

third_party/intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
1818
LinearLayout dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout,
1919
ArrayRef<int64_t> shape);
2020

21+
LinearLayout
22+
subgroup2DBlockToLinearLayout(ArrayRef<int64_t> shape,
23+
intel::Subgroup2DBlockEncodingAttr layout,
24+
unsigned kWidth);
25+
2126
} // namespace mlir::triton::gpu
2227

2328
#endif // TRITON_DIALECT_TRITONINTELGPU_IR_LINEARLAYOUTCONVERSIONS_H

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,4 +280,47 @@ def WarpEncodingAttr : DistributedEncoding<"WarpEncoding", "intel_warp_encoding"
280280
let hasCustomAssemblyFormat = 1;
281281
}
282282

//===----------------------------------------------------------------------===//
// Intel Subgroup2DBlock Encoding
//===----------------------------------------------------------------------===//

def Subgroup2DBlockEncodingAttr : DistributedEncoding<"Subgroup2DBlockEncoding", "subgroup_2d_block_encoding", [MmaEncodingTrait], TritonIntelGPU_Dialect> {
  let mnemonic = "subgroup_2d_block";

  let description = [{
    An encoding for tensors produced via Intel Subgroup 2D Block IO operations.

    The subgroup 2D block IO operations read or write two-dimensional blocks of data from a two-dimensional region of memory. The Subgroup 2D Block Encoding layout is parameterized by the block width, block height, and block count for the individual load instructions and the distribution and replication of loads across warps.

    The SPV_INTEL_2d_block_io extension documentation provides more information on the subgroup 2D block IO operations and parameters: https://github.khronos.org/SPIRV-Registry/extensions/INTEL/SPV_INTEL_2d_block_io.html

    For the layout, the following parameters are required:
    - `instrShape` : contains the (height, width) block parameters for the block io operation
    - `numBlocks` : the block count parameter allows a single load to load multiple blocks in row-major order (useful for increasing cache line utilization)
    - `threadsPerWarp` : currently a scalar, this parameter allows us to support different subgroup / warp configurations. Because the 2d block io operation is a subgroup operation, the size of the subgroup is important in determining the ordering of the loaded tensor.
    - `warpsPerCTA` : the number of warps per block / subgroups per workgroup and their distribution
    - `order` : The order within the block, used to determine along which dimension to broadcast.
    - `kWidth` : Currently unused, but keeping because we will likely need it for layout conversions.
    - `CTALayout` : Describes how blocks are distributed among work-groups/thread blocks.
  }];

  let parameters = (
    ins
    ArrayRefParameter<"unsigned">:$warpsPerCTA,
    "CTALayoutAttr":$CTALayout,
    ArrayRefParameter<"unsigned">:$instrShape,
    "unsigned":$numBlocks,
    ArrayRefParameter<"unsigned">:$order,
    "unsigned":$kWidth,
    "unsigned":$threadsPerWarp
  );

  let extraClassDeclaration = extraDistributedDeclaration # [{
    SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
  }];

  let hasCustomAssemblyFormat = 1;
  let genVerifyDecl = 1;
}
283326
#endif

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,173 @@ void WarpEncodingAttr::print(mlir::AsmPrinter &printer) const {
495495
<< "}>";
496496
}
497497

498+
//===----------------------------------------------------------------------===//
499+
// Subgroup2DBlockEncodingAttr
500+
//===----------------------------------------------------------------------===//
501+
502+
namespace {
503+
std::optional<CTALayoutAttr> getCTALayoutOrError(
504+
AsmParser &parser, std::optional<SmallVector<unsigned>> CTAsPerCGA,
505+
std::optional<SmallVector<unsigned>> CTASplitNum,
506+
std::optional<SmallVector<unsigned>> CTAOrder, unsigned rank) {
507+
if (CTAsPerCGA && CTASplitNum && CTAOrder) {
508+
return CTALayoutAttr::get(parser.getContext(), *CTAsPerCGA, *CTASplitNum,
509+
*CTAOrder);
510+
}
511+
if (!CTAsPerCGA && !CTASplitNum && !CTAOrder) {
512+
return CTALayoutAttr::getDefault(parser.getContext(), rank);
513+
}
514+
parser.emitError(parser.getNameLoc(), "CTAsPerCGA, CTASplitNum, and CTAOrder "
515+
"must all be present or all be absent");
516+
return std::nullopt;
517+
}
518+
519+
// Print the CTALayout if it's not equal to the default.
520+
void maybePrintCTALayout(mlir::MLIRContext *context, mlir::AsmPrinter &printer,
521+
CTALayoutAttr layout, unsigned rank) {
522+
if (layout != CTALayoutAttr::getDefault(context, rank)) {
523+
printer << ", CTAsPerCGA = [" << ArrayRef(layout.getCTAsPerCGA()) << "]"
524+
<< ", CTASplitNum = [" << ArrayRef(layout.getCTASplitNum()) << "]"
525+
<< ", CTAOrder = [" << ArrayRef(layout.getCTAOrder()) << "]";
526+
}
527+
}
528+
529+
} // namespace
530+
531+
// Verify the structural invariants of a Subgroup2DBlockEncodingAttr:
// instrShape, order, and warpsPerCTA must all be rank 2; kWidth must be one
// of {1, 2, 4}; threadsPerWarp is currently restricted to 16.
LogicalResult Subgroup2DBlockEncodingAttr::verify(
    function_ref<InFlightDiagnostic()> emitError,
    ArrayRef<unsigned> warpsPerCTA, CTALayoutAttr CTALayout,
    ArrayRef<unsigned> instrShape, unsigned numBlocks, ArrayRef<unsigned> order,
    unsigned kWidth, unsigned threadsPerWarp) {
  if (instrShape.size() != 2) {
    return emitError() << "instrShape must be rank 2 but was: "
                       << instrShape.size();
  }
  if (order.size() != 2) {
    return emitError() << "order must be rank 2 but was " << order.size();
  }
  if (warpsPerCTA.size() != 2) {
    return emitError() << "warpsPerCTA must be rank 2 but was "
                       << warpsPerCTA.size();
  }
  if (!(kWidth == 1 || kWidth == 2 || kWidth == 4)) {
    return emitError() << "kWidth must be 1, 2 or 4, but was: " << kWidth;
  }
  // Bug fix: the original `!threadsPerWarp == 16` applies the negation first
  // (yielding 0 or 1), so the comparison was always false and the check never
  // fired. Compare the value directly instead.
  if (threadsPerWarp != 16) {
    return emitError() << "threadsPerWarp must be 16, but was: "
                       << threadsPerWarp;
  }
  return success();
}
// Parse the custom assembly form `<{key = value, ...}>`. The CTA fields
// (CTAsPerCGA, CTASplitNum, CTAOrder) are optional as a group; all other
// keys populate the attribute parameters directly.
Attribute Subgroup2DBlockEncodingAttr::parse(AsmParser &parser, Type type) {
  if (parser.parseLess().failed())
    return {};
  DictionaryAttr dict;
  if (parser.parseAttribute(dict).failed())
    return {};
  if (parser.parseGreater().failed())
    return {};

  SmallVector<unsigned> warpsPerCTA;
  std::optional<SmallVector<unsigned>> CTAsPerCGA;
  std::optional<SmallVector<unsigned>> CTASplitNum;
  std::optional<SmallVector<unsigned>> CTAOrder;
  SmallVector<unsigned> instrShape;
  unsigned numBlocks = 0;
  SmallVector<unsigned> order;
  unsigned kWidth = 0;
  unsigned threadsPerWarp = 0;

  // Dictionary keys are unique, so dispatching with an else-if chain is
  // equivalent to testing each key independently.
  for (const NamedAttribute &namedAttr : dict) {
    auto key = namedAttr.getName();
    if (key == "warpsPerCTA") {
      if (parseIntArrayAttr(parser, namedAttr, warpsPerCTA, "warpsPerCTA")
              .failed())
        return {};
    } else if (key == "CTAsPerCGA") {
      if (parseIntArrayAttr(parser, namedAttr, CTAsPerCGA.emplace(),
                            "CTAsPerCGA")
              .failed())
        return {};
    } else if (key == "CTASplitNum") {
      if (parseIntArrayAttr(parser, namedAttr, CTASplitNum.emplace(),
                            "CTASplitNum")
              .failed())
        return {};
    } else if (key == "CTAOrder") {
      if (parseIntArrayAttr(parser, namedAttr, CTAOrder.emplace(), "CTAOrder")
              .failed())
        return {};
    } else if (key == "instrShape") {
      if (parseIntArrayAttr(parser, namedAttr, instrShape, "instrShape")
              .failed())
        return {};
    } else if (key == "numBlocks") {
      if (parseUInt(parser, namedAttr, numBlocks, "numBlocks").failed())
        return {};
    } else if (key == "order") {
      if (parseIntArrayAttr(parser, namedAttr, order, "order").failed())
        return {};
    } else if (key == "kWidth") {
      if (parseUInt(parser, namedAttr, kWidth, "kWidth").failed())
        return {};
    } else if (key == "threadsPerWarp") {
      if (parseUInt(parser, namedAttr, threadsPerWarp, "threadsPerWarp")
              .failed())
        return {};
    }
  }

  std::optional<CTALayoutAttr> CTALayout = getCTALayoutOrError(
      parser, CTAsPerCGA, CTASplitNum, CTAOrder, /*rank=*/warpsPerCTA.size());
  if (!CTALayout.has_value())
    return {};

  return parser.getChecked<Subgroup2DBlockEncodingAttr>(
      parser.getContext(), warpsPerCTA, *CTALayout, instrShape, numBlocks,
      order, kWidth, threadsPerWarp);
}
// Replication order for this encoding follows row-major matrix order.
SmallVector<unsigned> Subgroup2DBlockEncodingAttr::getRepOrder() const {
  return getMatrixOrder(getRank(), /*rowMajor*/ true);
}

// The CTA-level accessors below simply forward to the wrapped CTALayoutAttr.
SmallVector<unsigned> Subgroup2DBlockEncodingAttr::getCTAsPerCGA() const {
  return SmallVector<unsigned>(getCTALayout().getCTAsPerCGA());
}

SmallVector<unsigned> Subgroup2DBlockEncodingAttr::getCTAOrder() const {
  return SmallVector<unsigned>(getCTALayout().getCTAOrder());
}

SmallVector<unsigned> Subgroup2DBlockEncodingAttr::getCTASplitNum() const {
  return SmallVector<unsigned>(getCTALayout().getCTASplitNum());
}

// Dimension order when this layout is used as dot operand `opIdx`, with the
// K dimension contiguous.
SmallVector<unsigned>
Subgroup2DBlockEncodingAttr::getRepOrderForOperand(int opIdx) const {
  return getOrderForDotOperand(opIdx, getRank(), /*kContig*/ true);
}
// Custom assembly printer; must stay in sync with `parse` above.
// Consistency fix: the original mixed `key = value` (warpsPerCTA, CTA fields)
// with `key=value` (numBlocks, order, kWidth, threadsPerWarp). All fields now
// use `key = value`; MLIR attribute parsing is whitespace-insensitive, so the
// round-trip behavior is unchanged.
void Subgroup2DBlockEncodingAttr::print(AsmPrinter &printer) const {
  printer << "<{" << "warpsPerCTA = [" << ArrayRef(getWarpsPerCTA()) << "]";

  maybePrintCTALayout(getContext(), printer, getCTALayout(), getRank());

  printer << ", instrShape = [" << getInstrShape()
          << "], numBlocks = " << getNumBlocks() << ", order = ["
          << getOrder() << "], kWidth = " << getKWidth()
          << ", threadsPerWarp = " << getThreadsPerWarp() << "}>";
}
// Delegate to the standalone conversion helper. kWidth is forwarded even
// though the conversion does not use it yet (kept for future conversions).
LinearLayout
Subgroup2DBlockEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
  return subgroup2DBlockToLinearLayout(shape, *this, getKWidth());
}
498665
//===----------------------------------------------------------------------===//
499666
// Dialect Interface
500667
//===----------------------------------------------------------------------===//

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,4 +523,119 @@ LinearLayout dotOperandDpasToLinearLayout(DotOperandEncodingAttr dotDpasLayout,
523523
return DPAStoLinearLayout(shape, dpasLayout, dotDpasLayout.getOpIdx());
524524
}
525525

526+
namespace {
527+
528+
static LinearLayout broadcastedDotOperandLayout(MLIRContext *ctx,
529+
ArrayRef<unsigned> shape,
530+
ArrayRef<unsigned> order,
531+
unsigned broadcastDim,
532+
StringAttr inDimName) {
533+
int rank = shape.size();
534+
auto dimNames = standardOutDimNames(ctx, rank);
535+
LinearLayout layout = LinearLayout::empty();
536+
537+
for (auto d : order) {
538+
if (d == broadcastDim) {
539+
layout *= LinearLayout::zeros1D(shape[d], inDimName, dimNames[d]);
540+
} else {
541+
layout *= LinearLayout::identity1D(shape[d], inDimName, dimNames[d]);
542+
}
543+
}
544+
return layout;
545+
}
546+
547+
using basisT = std::vector<std::vector<int32_t>>;
548+
549+
// Creates a row major tile layout with register/lane input dimensions according
550+
// to the provided height, width, and threadsPerWarp. The relationship between
551+
// the width and threadsPerWarp determines the packing of rows across lanes:
552+
// - if width == threadsPerWarp:
553+
// block row elements are mapped to registers in row major order, i.e. one
554+
// column per lane
555+
// - if width < threadsPerWarp:
556+
// multiple rows are mapped to the first register to fill the warp, i.e.
557+
// width * rowsPerWarp = threadsPerWarp
558+
// - if width > threadsPerWarp:
559+
// multiple elements of each row are assigned to registers such that
560+
// packedElementsPerLane row values exist in consecutive registers for each
561+
// lane
562+
std::pair<basisT, basisT>
563+
createRegisterLaneBases(const int height, const int width,
564+
const unsigned threadsPerWarp) {
565+
const int packedElementsPerLane =
566+
mlir::ceil<int>(width, static_cast<int>(threadsPerWarp));
567+
568+
basisT laneBases;
569+
for (int i = packedElementsPerLane; i < width; i = i << 1) {
570+
laneBases.push_back({0, i});
571+
}
572+
573+
const int rowsPerWarp =
574+
mlir::ceil<int>(threadsPerWarp, 1 << laneBases.size());
575+
// Place subsequent rows into adjacent lanes until all lanes have been filled
576+
for (int i = 1; i < rowsPerWarp; i = i << 1) {
577+
laneBases.push_back({i, 0});
578+
}
579+
580+
basisT regBases;
581+
582+
// Add packed row-wise elements (width > threadsPerWarp) before adding columns
583+
for (int i = 1; i < packedElementsPerLane; i = i << 1) {
584+
regBases.push_back({0, i});
585+
}
586+
587+
for (int i = 1; i < height / rowsPerWarp; i = i << 1) {
588+
regBases.push_back({i * rowsPerWarp, 0});
589+
}
590+
591+
return std::make_pair(regBases, laneBases);
592+
}
593+
594+
} // namespace
595+
596+
// Convert a Subgroup2DBlockEncodingAttr to a LinearLayout with standard
// (register, lane, warp) input dimensions for a tensor of `blockShape`.
// `kWidth` is accepted for interface stability but is currently unused by
// the conversion.
LinearLayout
subgroup2DBlockToLinearLayout(ArrayRef<int64_t> blockShape,
                              intel::Subgroup2DBlockEncodingAttr layout,
                              unsigned kWidth) {
  auto ctx = layout.getContext();
  int rank = blockShape.size();
  assert(rank == layout.getRank() && "unexpected block shape rank, layout rank "
                                     "and block shape rank must be equal");
  auto dimNames = standardOutDimNames(ctx, rank);
  auto loadTileSize = layout.getInstrShape();
  StringAttr kRegister = S("register");
  StringAttr kLane = S("lane");
  StringAttr kWarp = S("warp");

  // Start by creating register/lane bases corresponding to the desired load
  // tile size (instrShape = (height, width)).
  auto [regBases, laneBases] = createRegisterLaneBases(
      loadTileSize[0], loadTileSize[1], layout.getThreadsPerWarp());

  LinearLayout::BasesT bases;
  bases[kRegister] = regBases;
  bases[kLane] = laneBases;
  auto ctaLayout = LinearLayout(bases, dimNames);

  assert(ctaLayout.getInDimSize(kLane) <= layout.getThreadsPerWarp() &&
         "number of lanes should not exceed threads per warp");

  // Increasing the block count always increases the inner dimension for the
  // register/lane layout regardless of order.
  ctaLayout *=
      LinearLayout::identity1D(layout.getNumBlocks(), kRegister, dimNames[1]);

  // Broadcast the layout according to warpsPerCTA, then combine with the
  // overall CTALayout and reshape according to the provided blockShape.
  auto warpOrder = getMatrixOrder(rank, /*rowMajor*/ true);
  auto order = layout.getOrder();
  assert(order.size() == 2 && "only rank 2 order supported");
  unsigned inner = order[0];

  ctaLayout *= broadcastedDotOperandLayout(ctx, layout.getWarpsPerCTA(),
                                           warpOrder, inner, kWarp)
                   .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
  return combineCtaCgaWithShape(ctaLayout, layout.getCTALayout(), blockShape);
}
526641
} // namespace mlir::triton::gpu

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1965,6 +1965,20 @@ struct LoadOpConversion
19651965
}
19661966
Value elemSizeInBytes = b.i32_val(originalElemBits / 8);
19671967

1968+
LLVM_DEBUG({
1969+
const unsigned numLoads = numRepOuter * numLoadPerOutRepCluster *
1970+
numRepInner / numOperandsInnerDimPerLoad;
1971+
llvm::dbgs() << "Preparing to dispatch " << numLoads << " loads\n";
1972+
llvm::dbgs() << "Outer loads: " << numRepOuter * numLoadPerOutRepCluster
1973+
<< " (" << numLoadPerOutRepCluster
1974+
<< " per out rep cluster)\n";
1975+
llvm::dbgs() << "Inner loads: "
1976+
<< numRepInner / numOperandsInnerDimPerLoad << "\n";
1977+
llvm::dbgs() << "Load dimension: " << tileHeight << ", "
1978+
<< tileWidth * vBlocks << " (" << elemSizeInBits
1979+
<< " bits)\n";
1980+
});
1981+
19681982
ValueTable loadVals;
19691983
for (int outer = 0; outer < numRepOuter; ++outer) {
19701984
for (int rep = 0; rep < numLoadPerOutRepCluster; ++rep) {

third_party/intel/unittest/Dialect/TritonIntelGPU/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,13 @@ add_triton_ut(
88
TritonIntelGPUTransforms
99
TritonNvidiaGPUTransforms
1010
)
# Intel-specific linear layout conversion unit tests.
add_triton_ut(
  NAME LinearLayoutConversionsIntel
  SRCS LinearLayoutConversionsTest.cpp
  LIBS
    TritonGPUIR
    TritonGPUTransforms
    TritonIntelAnalysis
    TritonIntelGPUTransforms
    TritonNvidiaGPUTransforms
)

0 commit comments

Comments
 (0)