Fix multi batch correctness

leonling-ll · leonling-ll · commit dab71a1fdc0a · 2024-10-27T20:28:58.000Z
diff --git a/python/src/ir.cc b/python/src/ir.cc
@@ -1622,7 +1622,7 @@ void init_triton_ir(py::module &&m) {
              if (haveDump) {
                auto printingFlags = OpPrintingFlags();
                printingFlags.elideLargeElementsAttrs(16);
-               printingFlags.enableDebugInfo();
+               //  printingFlags.enableDebugInfo();
                auto printAlways = [funcToDump](Pass *, Operation *op) -> bool {
                  if (funcToDump.empty())
                    return true;
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -180,12 +180,6 @@ DpasEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const {
   elemsPerThread[rank - 2] = sizePerThread[rank - 2] * tilesRow;
   elemsPerThread[rank - 1] = sizePerThread[rank - 1] * tilesCol;
 
-  // if (rank == 3)
-  //   std::cout << "elemsPerThread: " << elemsPerThread[0] << ", " <<
-  //   elemsPerThread[1] << ", " << elemsPerThread[2] << std::endl;
-  // else
-  //   std::cout << "elemsPerThread: " << elemsPerThread[0] << ", " <<
-  //   elemsPerThread[1] << std::endl;
   return elemsPerThread;
 }
 
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp
@@ -1,6 +1,8 @@
 #include "../TritonGPUToLLVMBase.h"
 #include "../Utility.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Support/LLVM.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using ValueTable = std::map<std::array<int, 3>, Value>;
@@ -16,16 +18,18 @@ template <unsigned opIdx> class DpasMatmulLoader {
   DpasMatmulLoader(DpasEncodingAttr dpasLayout, MemDescType descTy,
                    unsigned warpsPerTile, ArrayRef<Value> smemStrides,
                    const SmallVector<unsigned> &warpShape,
+                   SmallVector<Value> multiDimWarpId,
                    ConversionPatternRewriter &rewriter,
                    const LLVMTypeConverter *typeConverter, Location loc)
       : dpasLayout(dpasLayout), descTy(descTy), smemStrides(smemStrides),
-        rewriter(rewriter), loc(loc) {
+        multiDimWarpId(multiDimWarpId), rewriter(rewriter), loc(loc) {
     static_assert(opIdx == 0 || opIdx == 1);
 
     size_t rank = warpShape.size();
-    unsigned kDim = (opIdx == 0) ? rank - 1 : rank - 2;
-    unsigned nonKDim = (opIdx == 0) ? rank - 2 : rank - 1;
-    repBatchDimStride = rank == 3 ? smemStrides[0] : i32_val(1);
+    unsigned kDim = opIdx ? rank - 2 : rank - 1;
+    unsigned nonKDim = opIdx ? rank - 1 : rank - 2;
+    // Assume that smem is create with layout offset {2, 1, 0}
+    repBatchDimStride = smemStrides[0];
     repKDimStride = mul(i32_val(warpShape[kDim]), smemStrides[kDim]);
     repNonKDimStride =
         mul(i32_val(warpShape[nonKDim] * warpsPerTile), smemStrides[nonKDim]);
@@ -60,6 +64,7 @@ template <unsigned opIdx> class DpasMatmulLoader {
   MemDescType descTy;
 
   SmallVector<Value> smemStrides;
+  SmallVector<Value> multiDimWarpId;
   Value repBatchDimStride;
   Value repNonKDimStride;
   Value repKDimStride;
@@ -133,6 +138,7 @@ DpasMatmulLoader<opIdx>::computeLdsMatOffs(Value warpId, Value laneId,
   SmallVector<unsigned> instShape = opIdx == 0 ? dpasLayout.getDPASInstShapeA()
                                                : dpasLayout.getDPASInstShapeB();
   ArrayRef<int64_t> shareMemoryShape = descTy.getShape();
+  SmallVector<int64_t> shapePerCTA = getShapePerCTA(descTy);
 
   SmallVector<Value> offs(numPtrs);
   const unsigned repClusterSize =
@@ -160,8 +166,8 @@ DpasMatmulLoader<opIdx>::computeLdsMatOffs(Value warpId, Value laneId,
 
         // round the offset into the tensor's shape limitation. (Rounded
         // broadcast)
-        iBase = urem(iBase, i32_val(shareMemoryShape[0]));
-        jBase = urem(jBase, i32_val(shareMemoryShape[1]));
+        iBase = urem(iBase, i32_val(shareMemoryShape[rank - 2]));
+        jBase = urem(jBase, i32_val(shareMemoryShape[rank - 1]));
 
         // inner index offset
         Value jOff = zeroVal;
@@ -170,10 +176,17 @@ DpasMatmulLoader<opIdx>::computeLdsMatOffs(Value warpId, Value laneId,
         jOff = add(jOff, udiv(cSwizzleOffset, vecVal));
         jOff = mul(xor_(jOff, phase), vecVal);
 
-        Value i = add(mul(iBase, smemStrides[0]), iOff);
-        Value j = add(mul(jBase, smemStrides[1]), jOff);
+        Value i = add(mul(iBase, smemStrides[rank - 2]), iOff);
+        Value j = add(mul(jBase, smemStrides[rank - 1]), jOff);
 
-        offs[index++] = add(i, j);
+        Value baseOff;
+        if (shapePerCTA.size() == 3 && shapePerCTA[0] > 1) {
+          Value batchOffset =
+              mul(multiDimWarpId[0], i32_val(shapePerCTA[1] * shapePerCTA[2]));
+          offs[index++] = add(batchOffset, add(i, j));
+        } else {
+          offs[index++] = add(i, j);
+        }
       }
     }
   }
@@ -190,13 +203,14 @@ Value DpasMatmulLoader<opIdx>::loadMatrix(
       llvm::any_of(structTy.getBody(), [&](Type ty) { return ty == elemTy; }) &&
       "The struct should have the same element types.");
 
-  Value offsetBatch = mul(i32_val(repBatch), repBatchDimStride);
   Value offsetOuter = mul(i32_val(repOuter), repNonKDimStride);
   Value offsetInner = mul(i32_val(repInner), repKDimStride);
   Value offset = add(offsetOuter, offsetInner);
-  // FIXME: repBatchSize and
+  SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
+  // 3DTODO: check if repBatch * warpsPerCTA[0] is correct for the offset.
   if (repBatch > 0) {
-    Value offsetBatch = mul(i32_val(repBatch), repBatchDimStride);
+    Value offsetBatch =
+        mul(i32_val(repBatch * warpsPerCTA[0]), repBatchDimStride);
     offset = add(offset, offsetBatch);
   }
 
@@ -260,7 +274,8 @@ template <unsigned opIdx>
 std::function<void(int, int, int)>
 getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
                 DpasEncodingAttr dpasLayout, unsigned warpsPerTile,
-                SmallVector<unsigned> instrShape, Value warpId,
+                SmallVector<unsigned> shapePerWarp,
+                SmallVector<Value> multiDimWarpId, Value warpId,
                 Value outerWarpDim, Value laneId, ValueTable &vals,
                 const LLVMTypeConverter *typeConverter,
                 ConversionPatternRewriter &rewriter, Location loc) {
@@ -271,13 +286,14 @@ getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
 
   auto sharedLayout = cast<SharedEncodingAttr>(descTy.getEncoding());
   ArrayRef<unsigned> order = sharedLayout.getOrder();
+  unsigned rank = order.size();
 
   // (a, b) is the coordinate.
-  auto load = [=, &rewriter, &smemObj, &instrShape, &vals](int batch, int outer,
-                                                           int inner) {
-    DpasMatmulLoader<opIdx> loader(dpasLayout, descTy, warpsPerTile,
-                                   smemObj.strides, instrShape, rewriter,
-                                   typeConverter, loc);
+  auto load = [=, &rewriter, &smemObj, &shapePerWarp, &multiDimWarpId,
+               &vals](int batch, int outer, int inner) {
+    DpasMatmulLoader<opIdx> loader(
+        dpasLayout, descTy, warpsPerTile, smemObj.strides, shapePerWarp,
+        multiDimWarpId, rewriter, typeConverter, loc);
 
     // Offset of a slice within the original tensor in shared memory.
     Value cSwizzleOffset = smemObj.getCSwizzleOffset(order[0]);
@@ -295,7 +311,7 @@ getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
           gep(ptr_ty(rewriter.getContext(), 3), smemTy, smemBase, offs[i]);
 
     // Load from shared memory.
-    unsigned totalElem = product<unsigned>(instrShape);
+    unsigned totalElem = product<unsigned>(shapePerWarp);
     unsigned threadsPerWarp = product<unsigned>(getThreadsPerWarp(dpasLayout));
     auto matTy = LLVM::LLVMStructType::getLiteral(
         eltTy.getContext(),
@@ -348,8 +364,9 @@ Value loadOperand(ConversionPatternRewriter &rewriter, Location loc,
   // Get the function to use to load the operand.
   ValueTable vals;
   std::function<void(int, int, int)> loadFn = getLoadMatrixFn<opIdx>(
-      descTy, smemObj, dpasLayout, warpsPerTile, std::move(shape), warpId,
-      outerWarpDim, laneId, vals, typeConverter, rewriter, loc);
+      descTy, smemObj, dpasLayout, warpsPerTile, std::move(shape),
+      std::move(multiDimWarpId), warpId, outerWarpDim, laneId, vals,
+      typeConverter, rewriter, loc);
 
   // Load the operand.
   int64_t numRepBatch = numReps[0];
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -299,7 +299,8 @@ class DotOpDPASConversionHelper {
 
     size_t totalElems = elems.size();
     size_t numElemsPerOperand =
-        totalElems / ((outer * inner) * (repClusterOuter * repClusterInner));
+        totalElems /
+        ((batch * outer * inner) * (repClusterOuter * repClusterInner));
     VectorType dotOpTy = vec_ty(elemTy, numElemsPerOperand);
 
     int offset = 0;
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -349,6 +349,7 @@ struct PrefetchOpConversion
     Type eltTy = tensorType.getElementType();
     const ArrayRef<int64_t> shapeRef = tensorType.getShape();
     SmallVector<int64_t> tensorShape{shapeRef.begin(), shapeRef.end()};
+    assert(tensorShape.size() == 2 && "Only 2D tensors are prefetch supported");
 
     if (!memoryRowMajor) {
       // Swap the shape to make it row major and then get the tiling
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
@@ -568,7 +568,6 @@ emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
 
 inline SmallVector<SmallVector<unsigned>>
 emitOffsetForLayout(Attribute layout, RankedTensorType type) {
-  std::cout << "~! emitOffsetForLayout\n";
   if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout))
     return emitOffsetForDpasLayout(dpasLayout, type);
   if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout))
@@ -657,7 +656,9 @@ inline DenseMap<unsigned, Value> getSwizzledSharedPtrs(
   // Order
   auto inOrder = triton::gpu::getOrder(srcEncoding);
   auto outOrder = triton::gpu::getOrder(resSharedLayout);
+  unsigned rank = outOrder.size();
   assert(maxPhase == 1 ||
+         //  outVec * maxPhase <= srcShape[outOrder[rank-2]] &&
          outVec * maxPhase <= srcShape[outOrder[0]] &&
              "Swizzling would generate out of bounds memory accesses");
   // Tensor indices held by the current thread, as LLVM values