Skip to content

Commit c7b3773

Browse files
authored
Move tile size computation for subgroup 2d block io encoding (#4461)
Moves the tile size computation from LLVM lowering to a static method on the Subgroup 2D block encoding layout. This allows us to create the layout with the desired tile sizes at a higher level in the pass hierarchy. Moving this functionality now allows us to test using the existing layouts and lowering, ensuring no regressions. There is some cleanup that could be done but I opted for generic objects for now (e.g. `SmallVector`) for flexibility.
1 parent ff637a1 commit c7b3773

File tree

3 files changed

+105
-17
lines changed

3 files changed

+105
-17
lines changed

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUAttrDefs.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ def Subgroup2DBlockEncodingAttr : DistributedEncoding<"Subgroup2DBlockEncoding",
317317

318318
let extraClassDeclaration = extraDistributedDeclaration # [{
319319
SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
320+
static SmallVector<unsigned, 3> getInstrShapeForLayout(DistributedEncodingTrait layout, ArrayRef<int64_t> shape, bool memoryRowMajor, unsigned kWidth, MLIRContext* context);
320321
}];
321322

322323
let hasCustomAssemblyFormat = 1;

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,92 @@ Subgroup2DBlockEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {
662662
return subgroup2DBlockToLinearLayout(shape, *this, getKWidth());
663663
}
664664

665+
/// Computes the 2D block-IO instruction shape for \p layout as
/// {tileHeight, tileWidth, numBlocks} (v_blocks), sized to the DPAS
/// instruction shape of the operand the layout describes.
///
/// \param layout          a DotOperandEncodingAttr (operand A/B) or a
///                        DpasEncodingAttr directly (treated as operand C).
/// \param tensorShape     shape of the tensor being loaded/stored.
/// \param memoryRowMajor  whether the in-memory layout is row major.
/// \param kWidth          element width in BYTES here (callers pass
///                        elemSizeInBits / 8) — note the name suggests the
///                        dot-layout kWidth; TODO confirm intended units.
/// \param context         MLIRContext used to build the linear-encoding attr.
SmallVector<unsigned, 3> Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
    DistributedEncodingTrait layout, ArrayRef<int64_t> tensorShape,
    bool memoryRowMajor, unsigned kWidth, MLIRContext *context) {
  const auto rank = tensorShape.size();

  // Derive the thread order from the linear-layout form of the encoding; the
  // last two dims decide whether register values are row or column major.
  std::optional<LinearLayout> llEncoding = layout.toLinearLayout(tensorShape);
  assert(llEncoding.has_value() && "invalid dot layout to linear layout");
  LinearEncodingAttr llAttr = LinearEncodingAttr::get(context, *llEncoding);
  SmallVector<unsigned> threadOrder = llAttr.getThreadOrder();

  const bool valueRowMajor =
      (threadOrder[rank - 2] == 1 && threadOrder[rank - 1] == 0);
  assert((valueRowMajor ||
          (threadOrder[rank - 2] == 0 && threadOrder[rank - 1] == 1)) &&
         "Only row_major or column_major is allowed");
  // A transposed load is needed when the value order and the memory order
  // disagree.
  const bool isTransposeRequired = valueRowMajor ^ memoryRowMajor;

  // opIdx 0/1 come from a dot-operand encoding; a bare DPAS layout is the
  // accumulator/result, encoded here as opIdx == 2.
  auto dotEncodingAttr = dyn_cast<DotOperandEncodingAttr>(layout);
  const unsigned opIdx = dotEncodingAttr ? dotEncodingAttr.getOpIdx() : 2;

  // TODO: can this be moved into the DpasEncodingAttr layout?
  auto getDPASInstShape = [](const auto dpasLayout, const unsigned opIdx) {
    switch (opIdx) {
    case 0:
      return dpasLayout.getDPASInstShapeA();
    case 1:
      return dpasLayout.getDPASInstShapeB();
    case 2:
      return dpasLayout.getDPASInstShapeC();
    default:
      llvm_unreachable("invalid opidx");
    }
  };

  // For A/B the DPAS layout is the dot operand's parent; otherwise the layout
  // itself must already be a DPAS encoding.
  DpasEncodingAttr dpasLayout =
      dotEncodingAttr ? cast<DpasEncodingAttr>(dotEncodingAttr.getParent())
                      : cast<DpasEncodingAttr>(layout);
  assert(dpasLayout && "only dpas layout is supported");

  const SmallVector<unsigned> dpasInstShape =
      getDPASInstShape(dpasLayout, opIdx);
  const SmallVector<unsigned> elemsPerDPASInst = {dpasInstShape[0],
                                                  dpasInstShape[1]};
  // Map the instruction shape through the thread order: width follows the
  // fastest-varying dim, height the other.
  unsigned tileWidth = elemsPerDPASInst[threadOrder[rank - 2]];
  unsigned tileHeight = elemsPerDPASInst[threadOrder[rank - 1]];

  // The C operand is loaded/stored one DPAS tile at a time (single block).
  if (opIdx == 2) {
    return {tileHeight, tileWidth, 1};
  }

  // For the A and B matrices, enlarge the tile size to support multiple DPAS
  // operands
  ArrayRef<unsigned> repCluster = dpasLayout.getRepCluster();
  SmallVector<int64_t> numReps =
      dpasLayout.getDPASRepetitions(tensorShape, opIdx);

  const bool isOperandA = opIdx == 0;
  // Outer dim of the operand: rows (rank-2) for A, columns (rank-1) for B.
  const unsigned dimOuter = bool(opIdx) ? rank - 1 : rank - 2;
  // numReps index: 2 (inner reps) for A (opIdx == 0), 1 for B (opIdx == 1).
  // NOTE(review): dpasOperandsPerTileX is computed (and reassigned below) but
  // never consumed before returning — presumably reserved for a follow-up
  // change; confirm whether it should feed the returned shape.
  unsigned dpasOperandsPerTileX =
      isOperandA ? repCluster[dimOuter] : numReps[unsigned(opIdx) ? 1 : 2];
  unsigned dpasOperandsPerTileY =
      isOperandA ? numReps[unsigned(opIdx) ? 1 : 2] : repCluster[dimOuter];

  if (isTransposeRequired) {
    // Transposed load: the HW tile's width/height swap roles relative to the
    // value layout.
    std::swap(tileWidth, tileHeight);

    const unsigned threadsPerWarp = dpasLayout.getThreadsPerWarp();
    // Only pack multiple operands per load when a full column fits the warp;
    // otherwise load one operand per instruction.
    dpasOperandsPerTileX =
        (threadsPerWarp <= tileHeight) ? repCluster[rank - 1] : 1;

    // Transposed 2D loads operate on 32-bit elements, so the element-count
    // width shrinks by the packing factor 32 / (kWidth * 8).
    // TODO(review): document the exact HW limitation this encodes (original
    // comment read "what are those...?").
    tileWidth = tileWidth / (32 / (kWidth * 8));

    // Transposed loads do not use multiple blocks per instruction.
    dpasOperandsPerTileY = 1;
  }

  // PVC 2D load supports 64 bytes per row at most. Load multiple dot operands
  // by enlarging the number of blocks.
  const unsigned totalBytesPerRowPerDPASOp = tileWidth * kWidth;
  dpasOperandsPerTileY =
      std::min(dpasOperandsPerTileY, 64 / totalBytesPerRowPerDPASOp);
  const unsigned numBlocks = dpasOperandsPerTileY;

  return {tileHeight, tileWidth, numBlocks};
}
750+
665751
//===----------------------------------------------------------------------===//
666752
// Dialect Interface
667753
//===----------------------------------------------------------------------===//

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,6 +1439,14 @@ struct LoadOpConversion
14391439

14401440
Type eltTy = tensorType.getElementType();
14411441
unsigned elemSizeInBits = eltTy.getIntOrFloatBitWidth();
1442+
1443+
auto tileParams = Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
1444+
cast<DistributedEncodingTrait>(encoding), tensorType.getShape(),
1445+
memoryRowMajor, elemSizeInBits / 8, rewriter.getContext());
1446+
unsigned tileHeight = tileParams[0];
1447+
const unsigned tileWidth = tileParams[1];
1448+
const unsigned vBlocks = tileParams[2];
1449+
14421450
DpasEncodingAttr dpasLayout = getDpasLayout(tensorType);
14431451
const ArrayRef<int64_t> tensorShape = tensorType.getShape();
14441452
unsigned numElems = getTotalElemsPerThread(resultType);
@@ -1476,8 +1484,7 @@ struct LoadOpConversion
14761484

14771485
Value elemSizeInBytes = b.i32_val(elemSizeInBits / 8);
14781486

1479-
SmallVector<unsigned> elemsPerInstr = dpasLayout.getDPASInstShapeC();
1480-
int64_t elemsPerLane = product<unsigned>(elemsPerInstr) / threadsPerWarp;
1487+
const unsigned elemsPerLane = tileWidth * tileHeight / threadsPerWarp;
14811488
Type load2DGenXType =
14821489
LLVM::getVectorType(IntegerType::get(ctx, elemSizeInBits),
14831490
elemsPerLane); // make it opaque type.
@@ -1527,12 +1534,12 @@ struct LoadOpConversion
15271534
for (int repM = 0; repM < repCluster[0]; ++repM) {
15281535

15291536
Value offsetY =
1530-
b.add(warpId0Offset, b.i32_val(m * replicaStride[0] +
1531-
repM * elemsPerInstr[0]));
1537+
b.add(warpId0Offset,
1538+
b.i32_val(m * replicaStride[0] + repM * tileHeight));
15321539
for (int repN = 0; repN < repCluster[1]; ++repN) {
15331540
Value offsetX =
1534-
b.add(warpId1Offset, b.i32_val(n * replicaStride[1] +
1535-
repN * elemsPerInstr[1]));
1541+
b.add(warpId1Offset,
1542+
b.i32_val(n * replicaStride[1] + repN * tileWidth));
15361543

15371544
auto load2dOp = rewriter.create<TritonGEN::Matrix2DBlockLoadOp>(
15381545
loc, load2DGenXType,
@@ -1543,9 +1550,9 @@ struct LoadOpConversion
15431550
/*x*/ b.trunc(i32_ty, offsetX),
15441551
/*y*/ b.trunc(i32_ty, offsetY),
15451552
/*elem_size_in_bits*/ elemSizeInBits,
1546-
/*tile_width*/ elemsPerInstr[1],
1547-
/*tile_height*/ elemsPerInstr[0],
1548-
/*v_blocks*/ 1,
1553+
/*tile_width*/ tileWidth,
1554+
/*tile_height*/ tileHeight,
1555+
/*v_blocks*/ vBlocks,
15491556
/*transpose*/ false,
15501557
/*vnni_transform*/ false);
15511558
if (failed(load2dOp.verify())) {
@@ -1659,9 +1666,6 @@ struct LoadOpConversion
16591666
offsetBaseY] =
16601667
getValuesFromBlockPointerStruct(adaptor.getPtr(), rewriter);
16611668

1662-
unsigned tileWidth = elemsPerDPASInst[threadOrder[rank - 2]];
1663-
unsigned tileHeight = elemsPerDPASInst[threadOrder[rank - 1]];
1664-
16651669
MLIRContext *ctx = rewriter.getContext();
16661670
const StringAttr dimOuterStr = S("dim" + std::to_string(dimOuter));
16671671
const StringAttr dimInnerStr = S("dim" + std::to_string(dimInner));
@@ -1739,7 +1743,6 @@ struct LoadOpConversion
17391743
llvm::dbgs() << "tile layout done\n";
17401744
});
17411745

1742-
unsigned vBlocks = 1;
17431746
unsigned numOperandsOuterDimPerLoad = 1;
17441747
unsigned numOperandsInnerDimPerLoad = 1;
17451748

@@ -1756,11 +1759,10 @@ struct LoadOpConversion
17561759
if (!usePackedType)
17571760
return failure();
17581761

1759-
std::swap(tileHeight, tileWidth);
1760-
17611762
if (oneMatrixPerLoadForBT) {
17621763
// Only load 1 operand per inst on row.
17631764
numOperandsPer2DLoadM = 1;
1765+
tileHeight = elemsPerDPASInst[threadOrder[rank - 2]];
17641766
} else {
17651767
// We can decompose the matrix returned by transposed large 2d load
17661768
// when threads per warp < column size. Otherwise we have to load one
@@ -1775,6 +1777,7 @@ struct LoadOpConversion
17751777
numOperandsPer2DloadN = 1;
17761778
}
17771779

1780+
// TODO: move this logic to the instr shape computation
17781781
// PVC 2D load supports 32 rows at most. Load multiple dot operands by
17791782
// enlarging the tileHeight.
17801783
numOperandsPer2DLoadM = std::min(numOperandsPer2DLoadM, 32 / tileHeight);
@@ -1785,7 +1788,6 @@ struct LoadOpConversion
17851788
unsigned totalBytesPerRowPerDPASOp = tileWidth * elemSizeInBits / 8;
17861789
numOperandsPer2DloadN =
17871790
std::min(numOperandsPer2DloadN, 64 / totalBytesPerRowPerDPASOp);
1788-
vBlocks = numOperandsPer2DloadN;
17891791

17901792
numOperandsOuterDimPerLoad =
17911793
isOperandA ? numOperandsPer2DLoadM : numOperandsPer2DloadN;
@@ -1960,7 +1962,6 @@ struct LoadOpConversion
19601962
if (isTransposeRequired) {
19611963
// adjust the block io parameter to align HW's limitations on
19621964
// transposing load.
1963-
tileWidth = tileWidth / (32 / originalElemBits);
19641965
elemSizeInBits = 32;
19651966
}
19661967
Value elemSizeInBytes = b.i32_val(originalElemBits / 8);

0 commit comments

Comments
 (0)