Fix LinearLayout conversions and the ConvertLayoutOp lowering to LLVM

leonling-ll · leonling-ll · commit 2dd1d03ada34 · 2024-10-23T15:48:34.000Z
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
@@ -285,7 +285,7 @@ LinearLayout ensureLayoutNotSmallerThan(
   assert(kDim == "register" || kDim == "offset" && "unexpected kDim");
 
   LinearLayout ret = layout;
-  for (StringAttr outDimName : llvm::reverse(layout.getOutDimNames())) {
+  for (StringAttr outDimName : layout.getOutDimNames()) {
     int32_t actualSize = layout.getOutDimSize(outDimName);
     int32_t desiredSize = shape.lookup(outDimName);
     assert(actualSize > desiredSize ||
@@ -548,7 +548,7 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     auto laneBasesA =
         DPASLaneBasesA(opsPerChannel, threadsPerWarp, systolicDepth);
     tileLayout = LinearLayout({{kRegister, regBasesA}, {kLane, laneBasesA}},
-                              outDimNames);
+                              ArrayRef(outDimNames).take_back(2));
     // A only repeats by repCluster[rank - 2]
     dimNonK = rank - 2;
     dimK = rank - 1;
@@ -622,22 +622,33 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     if (rank == 3)
       tileLayout *=
           LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
-    // std::cout << (tileLayout.toString()) << std::endl;
+    std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.
   // Operand A/B repeats through the K-dimension first then repeats
   // through the non-K dimension.
-  // SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
-  // std::cout << "numReps: " << numReps[0] << ", " << numReps[1] << std::endl;
-  // tileLayout *=
-  //     LinearLayout::identity1D(numReps[dimK], kRegister, outDimNames[dimK]);
-  // tileLayout *= LinearLayout::identity1D(numReps[dimNonK], kRegister,
-  //                                        outDimNames[dimNonK]);
-  // if (rank == 3)
-  //   tileLayout *=
-  //       LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
-  // std::cout << (tileLayout.toString()) << std::endl;
+  SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
+
+  std::cout << "numReps: ";
+  for (auto numRep : numReps) {
+    std::cout << numRep << ", ";
+  }
+  std::cout << std::endl;
+
+  // numReps is always 3D, so add 1 to the dim index when rank is 2.
+  int repDimK = rank == 2 ? dimK + 1 : dimK;
+  int repDimNonK = rank == 2 ? dimNonK + 1 : dimNonK;
+  tileLayout *=
+      LinearLayout::identity1D(numReps[repDimK], kRegister, outDimNames[dimK]);
+  tileLayout *= LinearLayout::identity1D(numReps[repDimNonK], kRegister,
+                                         outDimNames[dimNonK]);
+  std::cout << "rank: " << rank << std::endl;
+  if (rank == 3)
+    tileLayout *=
+        LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
+  std::cout << "\ntileLayout with DPASRepetition: " << (tileLayout.toString())
+            << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -1,6 +1,7 @@
 #include "PatternTritonGPUOpToLLVM.h"
 #include "TargetInfo.h"
 #include "Utility.h"
+#include <iostream>
 
 #include "intel/include/Analysis/Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -110,17 +111,22 @@ struct ConvertLayoutOpConversion
       return multiDimOffset;
     }
     if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout)) {
-      assert(rank == 2);
+      assert(rank == 2 || rank == 3);
+      std::cout << "!!!getMultiDimOffset: dpasLayout" << std::endl;
       auto multiDimBase = ::intel::emitBaseIndexForLayout(
           loc, rewriter, targetInfo, layout, type, false);
       SmallVector<SmallVector<unsigned>> offsets;
       ::emitOffsetForDpasLayoutPerCTA(
           dpasLayout, offsets, multiDimCTAInRepId[0] * shapePerCTATile[0],
           multiDimCTAInRepId[1] * shapePerCTATile[1]);
 
-      SmallVector<Value> multiDimOffset = {
-          add(multiDimBase[0], i32_val(offsets[elemId][0])),
-          add(multiDimBase[1], i32_val(offsets[elemId][1]))};
+      SmallVector<Value> multiDimOffset(rank);
+      if (rank == 3)
+        multiDimOffset[0] = multiDimBase[0];
+      multiDimOffset[rank - 2] =
+          add(multiDimBase[rank - 2], i32_val(offsets[elemId][rank - 2]));
+      multiDimOffset[rank - 1] =
+          add(multiDimBase[rank - 1], i32_val(offsets[elemId][rank - 1]));
 
       return multiDimOffset;
     }
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -168,7 +168,7 @@ class DotOpDPASConversionHelper {
            "A and B precision enumerators do not match");
 
     LLVM_DEBUG({
-      llvm::dbgs() << "repB = " << repBatch << "\n";
+      llvm::dbgs() << "repBatch = " << repBatch << "\n";
       llvm::dbgs() << "repM = " << repM << "\n";
       llvm::dbgs() << "repK = " << repK << "\n";
       llvm::dbgs() << "repN = " << repN << "\n";
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -609,11 +609,10 @@ struct LoadOpConversion
 
     unsigned numOperandsPer2DLoadM, numOperandsPer2DloadN;
     if (!isTransposeRequired) {
-      int dimRep = opIdx ? 1 : 2;
       numOperandsPer2DLoadM =
-          isOperandA ? repCluster[dimOuter] : numReps[dimRep];
+          isOperandA ? repCluster[dimOuter] : numReps[opIdx ? 1 : 2];
       numOperandsPer2DloadN =
-          isOperandA ? numReps[dimRep] : repCluster[dimOuter];
+          isOperandA ? numReps[opIdx ? 1 : 2] : repCluster[dimOuter];
     } else {
       if (isOperandA)
         return failure();
@@ -671,8 +670,8 @@ struct LoadOpConversion
     unsigned warpOuterStride = warpShape[dimOuter];
     unsigned repKStride = elemsPerDPASInst[dimInner];
 
-    unsigned numRepOuter = numReps[dimOuter];
-    unsigned numRepInner = numReps[dimInner];
+    unsigned numRepOuter = numReps[opIdx ? 2 : 1];
+    unsigned numRepInner = numReps[opIdx ? 1 : 2];
 
     Value pitch;
     if (memoryRowMajor) {
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
@@ -164,8 +164,9 @@ emitOffsetForDpasLayoutPerCTA(const DpasEncodingAttr &dpasLayout,
   SmallVector<unsigned> sizePerThreads = getSizePerThread(dpasLayout);
   ArrayRef<unsigned> repCluster = dpasLayout.getRepCluster();
   size_t rank = repCluster.size();
-  SmallVector<unsigned> sizePerDPASInst = {sizePerThreads[0] / repCluster[0],
-                                           sizePerThreads[1] / repCluster[1]};
+  SmallVector<unsigned> sizePerDPASInst = {
+      sizePerThreads[rank - 2] / repCluster[rank - 2],
+      sizePerThreads[rank - 1] / repCluster[rank - 1]};
 
   unsigned rowsPerElem = dpasLayout.getSubGroupSize() / instShapeC[1];
   unsigned colsPerElem = 1;
@@ -176,15 +177,19 @@ emitOffsetForDpasLayoutPerCTA(const DpasEncodingAttr &dpasLayout,
     for (unsigned elemId = 0; elemId < elemNumberPerRep; ++elemId) {
       // Follows the C++ order for the dpas layout.
       SmallVector<unsigned> repOffset = {
-          (repId / repCluster[1]) * instShapeC[0],
-          (repId % repCluster[1]) * instShapeC[1]};
+          (repId / repCluster[rank - 1]) * instShapeC[0],
+          (repId % repCluster[rank - 1]) * instShapeC[1]};
 
       SmallVector<unsigned> elemOffset = {
           (elemId / sizePerDPASInst[1]) * rowsPerElem,
           (elemId % sizePerDPASInst[1]) * colsPerElem};
 
-      offsets.push_back({repOffset[0] + elemOffset[0] + ctaOffsetX,
-                         repOffset[1] + elemOffset[1] + ctaOffsetY});
+      if (rank == 3)
+        offsets.push_back({0, repOffset[0] + elemOffset[0] + ctaOffsetX,
+                           repOffset[1] + elemOffset[1] + ctaOffsetY});
+      else
+        offsets.push_back({repOffset[0] + elemOffset[0] + ctaOffsetX,
+                           repOffset[1] + elemOffset[1] + ctaOffsetY});
     }
   }
 }
@@ -289,9 +294,10 @@ emitOffsetForDpasLayout(const DpasEncodingAttr &dpasLayout,
   ArrayRef<int64_t> shape = type.getShape();
   SmallVector<SmallVector<unsigned>> offsets;
   SmallVector<unsigned> shapePerCTA = getShapePerCTATile(dpasLayout);
+  size_t rank = shape.size();
 
-  for (unsigned i = 0; i < shape[0]; i += shapePerCTA[0]) {
-    for (unsigned j = 0; j < shape[1]; j += shapePerCTA[1]) {
+  for (unsigned i = 0; i < shape[rank - 2]; i += shapePerCTA[rank - 2]) {
+    for (unsigned j = 0; j < shape[rank - 1]; j += shapePerCTA[rank - 1]) {
       emitOffsetForDpasLayoutPerCTA(dpasLayout, offsets, i, j);
     }
   }
@@ -333,13 +339,14 @@ emitBaseIndexForDotOpLayout(Location loc, RewriterBase &rewriter,
   size_t rank = warpShape.size();
   assert(rank == shapePerCTA.size() && "Rank mismatch");
   Value warpIndex =
-      (opIdx == 0) ? urem(multiDimWarpId[0],
+      (opIdx == 0) ? urem(multiDimWarpId[rank - 2],
                           i32_val(mlir::ceil<unsigned>(shapePerCTA[rank - 2],
                                                        warpShape[rank - 2])))
-                   : urem(multiDimWarpId[1],
+                   : urem(multiDimWarpId[rank - 1],
                           i32_val(mlir::ceil<unsigned>(shapePerCTA[rank - 1],
                                                        warpShape[rank - 1])));
-  Value warpOffset = mul(warpIndex, i32_val(warpShape[opIdx]));
+  Value warpOffset =
+      mul(warpIndex, i32_val(warpShape[opIdx ? rank - 1 : rank - 2]));
 
   // Compute the 2-dim coordinates of the first element in the warp operated
   // own by this thread.
@@ -355,7 +362,7 @@ emitBaseIndexForDotOpLayout(Location loc, RewriterBase &rewriter,
     // Unlike the operand B, to pack the value to i16 for scalar bit width
     // <=16.
     unsigned packedOpsPerLane = opsPerChannel == 4 ? 2 : 1;
-    unsigned packedColNum = shapeA[1] / packedOpsPerLane;
+    unsigned packedColNum = shapeA[rank - 1] / packedOpsPerLane;
     if (warpSize < packedColNum)
       llvm::report_fatal_error(
           "DpasEncodingAttr sub-group size could not "
@@ -375,12 +382,18 @@ emitBaseIndexForDotOpLayout(Location loc, RewriterBase &rewriter,
     laneRowIndex = mul(laneRowIndex, i32_val(opsPerChannel));
     laneColIndex = urem(laneId, i32_val(executionSize));
   } break;
+  default: {
+    llvm::report_fatal_error("Only support opIdx 1 or 0 for DotOpLayout.");
+  }
   }
 
-  auto multiDimBase =
-      (opIdx == 0)
-          ? SmallVector<Value>{add(laneRowIndex, warpOffset), laneColIndex}
-          : SmallVector<Value>{laneRowIndex, add(laneColIndex, warpOffset)};
+  SmallVector<Value> multiDimBase(rank);
+  if (rank == 3)
+    multiDimBase[0] = multiDimWarpId[0];
+  multiDimBase[rank - 2] =
+      (opIdx == 0) ? add(laneRowIndex, warpOffset) : laneRowIndex;
+  multiDimBase[rank - 1] =
+      (opIdx == 0) ? laneColIndex : add(laneColIndex, warpOffset);
 
   return multiDimBase;
 }
@@ -394,6 +407,7 @@ emitBaseIndexForDpasLayout(Location loc, RewriterBase &rewriter,
   Value warpId = udiv(threadId, warpSize);
   Value laneId = urem(threadId, warpSize);
 
+  unsigned rank = type.getShape().size();
   auto warpsPerCTA = dpasLayout.getWarpsPerCTA();
   ArrayRef<int64_t> shape = type.getShape();
 
@@ -404,19 +418,25 @@ emitBaseIndexForDpasLayout(Location loc, RewriterBase &rewriter,
   // Compute the 2-dim coordinates of the warp containing the tensor element
   // operated on by this thread.
   SmallVector<unsigned> warpShape = dpasLayout.getShapeC();
-  Value rowWarpId = urem(multiDimWarpId[0],
-                         i32_val(mlir::ceil<unsigned>(shape[0], warpShape[0])));
-  Value colWarpId = urem(multiDimWarpId[1],
-                         i32_val(mlir::ceil<unsigned>(shape[1], warpShape[1])));
-  Value rowWarpOffset = mul(rowWarpId, i32_val(warpShape[0]));
-  Value colWarpOffset = mul(colWarpId, i32_val(warpShape[1]));
+  Value rowWarpId =
+      urem(multiDimWarpId[rank - 2],
+           i32_val(mlir::ceil<unsigned>(shape[rank - 2], warpShape[rank - 2])));
+  Value colWarpId =
+      urem(multiDimWarpId[rank - 1],
+           i32_val(mlir::ceil<unsigned>(shape[rank - 1], warpShape[rank - 1])));
+  Value rowWarpOffset = mul(rowWarpId, i32_val(warpShape[rank - 2]));
+  Value colWarpOffset = mul(colWarpId, i32_val(warpShape[rank - 1]));
 
   // Compute the 2-dim coordinates of the first element in the warp operated
   // on by this thread.
   SmallVector<unsigned> threadsPerWarp = getThreadsPerWarp(dpasLayout);
-  SmallVector<Value> multiDimBase = {
-      add(udiv(laneId, i32_val(threadsPerWarp[1])), rowWarpOffset),
-      add(urem(laneId, i32_val(threadsPerWarp[1])), colWarpOffset)};
+  SmallVector<Value> multiDimBase(rank);
+  if (rank == 3)
+    multiDimBase[0] = multiDimWarpId[0];
+  multiDimBase[rank - 2] =
+      add(udiv(laneId, i32_val(threadsPerWarp[rank - 1])), rowWarpOffset);
+  multiDimBase[rank - 1] =
+      add(urem(laneId, i32_val(threadsPerWarp[rank - 1])), colWarpOffset);
   return multiDimBase;
 }