Fix index out of range for rank-3 DPAS layouts

leonling-ll · leonling-ll · commit 06eef9db837b · 2024-10-25T19:46:54.000Z
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1,5 +1,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
+#include <iostream>
 #include <numeric>
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
@@ -140,6 +141,8 @@ SmallVector<unsigned> DpasEncodingAttr::getSizePerThread() const {
   unsigned elemsPerThread = elemsNum / threadsPerWarp;
   auto repCluster = getRepCluster();
   // The Value is shard to lanes to threads per DPAS instruction.
+  if (rank == 3)
+    res[0] = repCluster[0];
   res[rank - 2] = elemsPerThread * repCluster[rank - 2];
   res[rank - 1] = repCluster[rank - 1];
   return res;
@@ -164,16 +167,25 @@ DpasEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const {
   size_t rank = shape.size();
   assert((rank == 2 || rank == 3) && "Unexpected rank of mma layout");
 
-  SmallVector<unsigned> elemsPerThread(rank);
+  SmallVector<unsigned> elemsPerThread(rank, 1);
   auto shapePerCTATile = getShapePerCTATile(shape);
   unsigned tilesRow =
       ceil<unsigned>(shape[rank - 2], shapePerCTATile[rank - 2]);
   unsigned tilesCol =
       ceil<unsigned>(shape[rank - 1], shapePerCTATile[rank - 1]);
   auto sizePerThread = getSizePerThread();
+  if (rank == 3)
+    elemsPerThread[0] =
+        sizePerThread[0] * ceil<unsigned>(shape[0], shapePerCTATile[0]);
   elemsPerThread[rank - 2] = sizePerThread[rank - 2] * tilesRow;
   elemsPerThread[rank - 1] = sizePerThread[rank - 1] * tilesCol;
 
+  // if (rank == 3)
+  //   std::cout << "elemsPerThread: " << elemsPerThread[0] << ", " <<
+  //   elemsPerThread[1] << ", " << elemsPerThread[2] << std::endl;
+  // else
+  //   std::cout << "elemsPerThread: " << elemsPerThread[0] << ", " <<
+  //   elemsPerThread[1] << std::endl;
   return elemsPerThread;
 }
 
@@ -382,14 +394,14 @@ SmallVector<unsigned> DpasEncodingAttr::getContigPerThread() {
   SmallVector<unsigned> contigPerThread(rank, 1);
 
   unsigned threadsPerWarp = getSubGroupSize();
-  auto shapeC = getDPASInstShapeC();
+  auto instShapeC = getDPASInstShapeC();
   // The software vectorization vectorized the value as C array: int a[N] -> int
   // a[N][threadsPerWarp]
-  if (threadsPerWarp > shapeC[1]) {
+  if (threadsPerWarp > instShapeC[1]) {
     return contigPerThread;
-  } else if (threadsPerWarp == shapeC[1]) {
+  } else if (threadsPerWarp == instShapeC[1]) {
     auto repCluster = getRepCluster();
-    contigPerThread[rank - 2] = shapeC[0] * repCluster[rank - 2];
+    contigPerThread[rank - 2] = instShapeC[0] * repCluster[rank - 2];
     return contigPerThread;
   } else {
     // threadsPerWarp < shapeC[1]
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -1,6 +1,7 @@
 #include "PatternTritonGPUOpToLLVM.h"
 #include "TargetInfo.h"
 #include "Utility.h"
+#include <iostream>
 
 #include "intel/include/Analysis/Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -115,12 +116,14 @@ struct ConvertLayoutOpConversion
           loc, rewriter, targetInfo, layout, type, false);
       SmallVector<SmallVector<unsigned>> offsets;
       ::emitOffsetForDpasLayoutPerCTA(
-          dpasLayout, offsets, multiDimCTAInRepId[0] * shapePerCTATile[0],
-          multiDimCTAInRepId[1] * shapePerCTATile[1]);
+          dpasLayout, offsets,
+          multiDimCTAInRepId[rank - 2] * shapePerCTATile[rank - 2],
+          multiDimCTAInRepId[rank - 1] * shapePerCTATile[rank - 1]);
 
       SmallVector<Value> multiDimOffset(rank);
       if (rank == 3)
-        multiDimOffset[0] = multiDimBase[0];
+        multiDimOffset[0] = add(multiDimBase[0], i32_val(multiDimCTAInRepId[0] *
+                                                         shapePerCTATile[0]));
       multiDimOffset[rank - 2] =
           add(multiDimBase[rank - 2], i32_val(offsets[elemId][rank - 2]));
       multiDimOffset[rank - 1] =
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
@@ -223,6 +223,7 @@ emitOffsetForDotOpLayout(const DotOperandEncodingAttr &dotLayout,
   unsigned executionSize = dpasLayout.getExecutionSize();
   unsigned opsPerChannel = dpasLayout.getOpsPerChannel();
 
+  unsigned rank = shape.size();
   unsigned numRowsPerPackedValue = 0u, numColsPerPackedValue = 0u;
   unsigned numColsPerLaneForPackedValue = 0u, numOpsPerPackedValue = 0u;
   switch (opIdx) {
@@ -232,7 +233,7 @@ emitOffsetForDotOpLayout(const DotOperandEncodingAttr &dotLayout,
     SmallVector<unsigned> shapeA = dpasLayout.getShapeA();
     // Unlike the operand B, to pack the value to i16 for scalar bit width <=16.
     numOpsPerPackedValue = opsPerChannel == 4 ? 2 : 1;
-    unsigned packedColNum = shapeA[1] / numOpsPerPackedValue;
+    unsigned packedColNum = shapeA[rank - 1] / numOpsPerPackedValue;
     // Each value name represent multiple rows if warpSize > packedColNum
     numRowsPerPackedValue = mlir::ceil(warpSize, packedColNum);
     numColsPerPackedValue = std::min(warpSize, packedColNum);
@@ -256,9 +257,9 @@ emitOffsetForDotOpLayout(const DotOperandEncodingAttr &dotLayout,
   int64_t numRepK = numReps[opIdx ? 1 : 2];
 
   ArrayRef<unsigned> repCluster = dpasLayout.getRepCluster();
-  unsigned repClusterSize = repCluster[opIdx];
+  unsigned repClusterSize = repCluster[opIdx ? rank - 1 : rank - 2];
 
-  for (unsigned dimOuter = 0; dimOuter < numRepOuter; ++dimOuter)
+  for (unsigned repOuter = 0; repOuter < numRepOuter; ++repOuter)
     for (unsigned k = 0; k < numRepK; ++k)
       for (unsigned rep = 0; rep < repClusterSize; ++rep) {
         for (unsigned elemId = 0; elemId < numElemPerInstPerThread; ++elemId) {
@@ -268,9 +269,9 @@ emitOffsetForDotOpLayout(const DotOperandEncodingAttr &dotLayout,
               (opIdx == 0) ? elemId % numOpsPerPackedValue : 0;
           unsigned packedElemId = elemId / numOpsPerPackedValue;
           unsigned repRowIndex =
-              shapePerCTATile[0] * (opIdx == 0 ? dimOuter : k);
+              shapePerCTATile[rank - 2] * (opIdx == 0 ? repOuter : k);
           unsigned repColIndex =
-              shapePerCTATile[1] * (opIdx == 0 ? k : dimOuter);
+              shapePerCTATile[rank - 1] * (opIdx == 0 ? k : repOuter);
           unsigned repClusterRowIndex = opIdx == 0 ? rep * instShape[0] : 0;
           unsigned repClusterColIndex = opIdx == 0 ? 0 : rep * instShape[1];
           unsigned packedElemRowIndex =
@@ -279,10 +280,17 @@ emitOffsetForDotOpLayout(const DotOperandEncodingAttr &dotLayout,
           unsigned packedElemColIndex =
               (packedElemId % numColsPerLaneForPackedValue) *
               numColsPerPackedValue;
-          offsets.push_back({repRowIndex + repClusterRowIndex +
-                                 packedElemRowIndex + opsRowIndex,
-                             repColIndex + repClusterColIndex +
-                                 packedElemColIndex + opsColIndex});
+          if (rank == 3)
+            offsets.push_back({0,
+                               repRowIndex + repClusterRowIndex +
+                                   packedElemRowIndex + opsRowIndex,
+                               repColIndex + repClusterColIndex +
+                                   packedElemColIndex + opsColIndex});
+          else
+            offsets.push_back({repRowIndex + repClusterRowIndex +
+                                   packedElemRowIndex + opsRowIndex,
+                               repColIndex + repClusterColIndex +
+                                   packedElemColIndex + opsColIndex});
         }
       }
 
@@ -560,6 +568,7 @@ emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
 
 inline SmallVector<SmallVector<unsigned>>
 emitOffsetForLayout(Attribute layout, RankedTensorType type) {
+  std::cout << "~! emitOffsetForLayout\n";
   if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout))
     return emitOffsetForDpasLayout(dpasLayout, type);
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout))